Taming Your Music Collection: ID3 From Web

Retrieving The ID3 Info
     Now that we've established that our files are missing ID3 info and that there is no reliable means of determining it from the file path, we must look to the web for our data. We'll use the MusicBrainz XML Web Service to search for and fill in our missing data. You can find out more about their web service at http://musicbrainz.org/doc/XML_Web_Service/Version_2. To make the calls to the service, we will use the urllib and urllib2 modules from the Python 2 standard library.
Code:
#!/usr/bin/python

import sys, os, fnmatch, shutil, argparse, re, urllib, urllib2
from mutagen.easyid3 import EasyID3
from mutagen.id3 import ID3, TIT2

def getTrackNumber(track):
    """Return the track number of an ID3 track string, zero-padded to 2 digits.

    Only the text before the first "/" is considered, so both "3" and "3/12"
    give "03". Returns None when that text is not made up solely of digits.
    """
    number = track.split("/", 1)[0]
    if number.isdigit():
        return number.zfill(2)
    return None

def getID3FromFilename(file, id3info):
    """Fill in and save ID3 tags derived from the file's path.

    The file name must look like "NN. Artist - Title.mp3"; the album is taken
    from the name of the directory that contains the file.

    Args:
        file: Path to the mp3 file.
        id3info: Tag mapping (an EasyID3 object in this script) supporting
            item assignment and save().

    A file whose name does not follow the pattern is reported and left
    untouched instead of aborting the whole run.
    """
    filename = os.path.basename(file)
    m = re.match(r"(?P<trackNumber>\d{2})\. (?P<artist>((?! -).)+) - (?P<title>[^\.]+)\.mp3", filename)

    if m is None:
        print("Skipping " + file + ": file name is not of the form 'NN. Artist - Title.mp3'")
        return

    id3info["tracknumber"] = m.group('trackNumber')
    id3info["artist"] = m.group('artist')
    id3info["title"] = m.group('title')
    # Name of the parent directory, independent of the path separator in use
    id3info["album"] = os.path.basename(os.path.dirname(file))

    id3info.save()

def getID3FromWeb(file, id3info):
    """Search MusicBrainz for this file's recording and print the raw XML reply.

    Args:
        file: Path of the mp3 file; used only in messages.
        id3info: Tag mapping whose "artist" and "title" entries are lists;
            their first items form the search query.

    id3info is not modified. Lookup failures are reported, not raised.
    """
    # Artist and title are both required to build the query; this function is
    # reached when some tag is missing, so either of them may be absent.
    try:
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
    except (KeyError, IndexError):
        print("Cannot look up " + file + ": artist and title tags are required")
        return

    # Query the MusicBrainz web service
    try:
        # urlencode() applies str() to each value, which fails on non-ASCII
        # unicode text in Python 2, so encode the query explicitly first.
        lucene = u'artist:' + _artist + u' AND recording:' + _title
        query = { 'query' : lucene.encode('utf-8') }
        response = urllib2.urlopen('http://musicbrainz.org/ws/2/recording?' + urllib.urlencode(query))
        try:
            print(response.read())
        finally:
            response.close()
    except Exception:
        # Best effort: report the failure type and carry on with the next file
        print(sys.exc_info()[0])

def move(output, file, dirMatch):
    """Move one mp3 into "<output>/<artist>/<album>/NN. <artist> - <title>.mp3".

    Args:
        output: Root directory of the organized collection.
        file: Path of the mp3 file to process.
        dirMatch: True when source and output directories are the same; tags
            are then recovered from the file name, otherwise from the web.

    When any of track number, artist, title or album is missing (or the track
    number is not numeric), the recovery routine is run and the file is not
    moved on this pass.
    """
    # EasyID3 needs an existing ID3 header, so create a minimal tag when
    # the file cannot be opened as ID3.
    try:
        tag = ID3(file)
    except Exception:
        tag = ID3()
        tag.add(TIT2(encoding=3, text=["Title"]))
        tag.save(file)

    id3info = EasyID3(file)

    try:
        _trackNumber = getTrackNumber(id3info["tracknumber"][0])
        if _trackNumber is None:
            # A non-numeric track tag is as unusable as a missing one
            raise KeyError("tracknumber")
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
        _album = id3info["album"][0]
    except KeyError:
        if dirMatch:
            getID3FromFilename(file, id3info)
        else:
            getID3FromWeb(file, id3info)
        return

    # A "/" inside a tag (e.g. "AC/DC") would be read as a directory separator
    # and point the move at a directory that does not exist.
    _artist, _album, _title = [part.replace("/", "-") for part in (_artist, _album, _title)]

    outputDir = os.path.join(output, _artist, _album)
    outputFile = _trackNumber + ". " + _artist + " - " + _title + ".mp3"

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    shutil.move(file, os.path.join(outputDir, outputFile))

def main():
    """Parse the command line and process every *.mp3 file under --directory."""
    parser = argparse.ArgumentParser(description='Tag and organize a collection of mp3 files.')

    parser.add_argument('-d', '--directory', nargs=1, required=True,
                        help='directory that is searched recursively for mp3 files')
    parser.add_argument('-o', '--output', nargs=1, required=True,
                        help='root directory of the organized collection')

    args = parser.parse_args()

    directory = os.path.abspath(args.directory[0])
    output = os.path.abspath(args.output[0])

    # Same source and destination: move() recovers missing tags from the file
    # name; otherwise it queries the web service.
    dirMatch = (directory == output)

    for root, subFolders, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*.mp3'):
            move(output, os.path.join(root, filename), dirMatch)

# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Result: Note: Due to the large amount of data returned from the XML web service, I will link to each file's results rather than display pages and pages of text.
$ ./mp3-tagger.py -d /UntamedMusic/ -o /Music/
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3AMoving+Mountains
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3AChild+Of+Dust
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3AThe+Earth+Isnt+Humming
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3ADigging+My+Own+Grave
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3AThe+Lion+And+The+Wolf
http://musicbrainz.org/ws/2/recording?query=artist%3AThrice+AND+recording%3ACome+All+You+Weary
     Next, we will use the xml.dom.minidom module to parse the XML returned by the MusicBrainz service. You can learn more about xml.dom.minidom at http://docs.python.org/library/xml.dom.minidom.html and about XML in general at http://www.w3schools.com/xml/xml_whatis.asp.
Code:
#!/usr/bin/python

import sys, os, fnmatch, shutil, argparse, re, urllib, urllib2, xml.dom.minidom
from mutagen.easyid3 import EasyID3
from mutagen.id3 import ID3, TIT2

def getTrackNumber(track):
    """Return the track number of an ID3 track string, zero-padded to 2 digits.

    Only the text before the first "/" is considered, so both "3" and "3/12"
    give "03". Returns None when that text is not made up solely of digits.
    """
    number = track.split("/", 1)[0]
    if number.isdigit():
        return number.zfill(2)
    return None

def getID3FromFilename(file, id3info):
    """Fill in and save ID3 tags derived from the file's path.

    The file name must look like "NN. Artist - Title.mp3"; the album is taken
    from the name of the directory that contains the file.

    Args:
        file: Path to the mp3 file.
        id3info: Tag mapping (an EasyID3 object in this script) supporting
            item assignment and save().

    A file whose name does not follow the pattern is reported and left
    untouched instead of aborting the whole run.
    """
    filename = os.path.basename(file)
    m = re.match(r"(?P<trackNumber>\d{2})\. (?P<artist>((?! -).)+) - (?P<title>[^\.]+)\.mp3", filename)

    if m is None:
        print("Skipping " + file + ": file name is not of the form 'NN. Artist - Title.mp3'")
        return

    id3info["tracknumber"] = m.group('trackNumber')
    id3info["artist"] = m.group('artist')
    id3info["title"] = m.group('title')
    # Name of the parent directory, independent of the path separator in use
    id3info["album"] = os.path.basename(os.path.dirname(file))

    id3info.save()

def getID3FromWeb(file, id3info):
    """Look up this file's recording on MusicBrainz and fill id3info in memory.

    The first recording returned by the artist/title search is used, and only
    when its ext:score attribute is exactly "100". Track number, artist, title
    and album (first listed release) are copied into id3info and printed.
    Nothing is saved to the file here.

    Args:
        file: Path of the mp3 file; used only in messages.
        id3info: Tag mapping whose "artist" and "title" entries are lists.

    Lookup and parsing failures are reported, not raised.
    """
    # Artist and title are both required to build the query; this function is
    # reached when some tag is missing, so either of them may be absent.
    try:
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
    except (KeyError, IndexError):
        print("Cannot look up " + file + ": artist and title tags are required")
        return

    # Query the MusicBrainz web service
    try:
        # urlencode() applies str() to each value, which fails on non-ASCII
        # unicode text in Python 2, so encode the query explicitly first.
        lucene = u'artist:' + _artist + u' AND recording:' + _title
        query = { 'query' : lucene.encode('utf-8') }
        response = urllib2.urlopen('http://musicbrainz.org/ws/2/recording?' + urllib.urlencode(query))
        try:
            x = xml.dom.minidom.parse(response)
        finally:
            response.close()

        recordingList = x.getElementsByTagNameNS('http://musicbrainz.org/ns/mmd-2.0#', 'recording-list')[0]
        recordings = recordingList.getElementsByTagName('recording')
        if not recordings:
            print("No MusicBrainz match for " + file)
            return

        recording = recordings[0]
        # getAttribute() yields "" when the attribute is absent, so a missing
        # score simply counts as "not a perfect match".
        if recording.getAttribute('ext:score') == '100':
            # The track-list offset is zero-based; track numbers start at 1
            offset = int(recording.getElementsByTagName('track-list')[0].getAttribute('offset'))
            id3info["tracknumber"] = getTrackNumber(str(offset + 1))
            id3info["artist"] = recording.getElementsByTagName('name')[0].firstChild.nodeValue
            id3info["title"] = recording.getElementsByTagName('title')[0].firstChild.nodeValue
            id3info["album"] = recording.getElementsByTagName('release')[0].getElementsByTagName('title')[0].firstChild.nodeValue

            print(id3info)
    except Exception:
        # Best effort: report the failure type and carry on with the next file
        print(sys.exc_info()[0])

def move(output, file, dirMatch):
    """Check one mp3's tags and, once enabled, move it into the collection.

    Args:
        output: Root directory of the organized collection.
        file: Path of the mp3 file to process.
        dirMatch: True when source and output directories are the same; tags
            are then recovered from the file name, otherwise from the web.

    When any of track number, artist, title or album is missing (or the track
    number is not numeric), the recovery routine is run. In this version the
    function returns before moving anything.
    """
    # EasyID3 needs an existing ID3 header, so create a minimal tag when
    # the file cannot be opened as ID3.
    try:
        tag = ID3(file)
    except Exception:
        tag = ID3()
        tag.add(TIT2(encoding=3, text=["Title"]))
        tag.save(file)

    id3info = EasyID3(file)

    try:
        _trackNumber = getTrackNumber(id3info["tracknumber"][0])
        if _trackNumber is None:
            # A non-numeric track tag is as unusable as a missing one
            raise KeyError("tracknumber")
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
        _album = id3info["album"][0]
    except KeyError:
        if dirMatch:
            getID3FromFilename(file, id3info)
        else:
            getID3FromWeb(file, id3info)
        return

    # NOTE: deliberate early exit. At this stage of the walkthrough the script
    # only reports tags; the move logic below is kept but not yet enabled.
    return

    # A "/" inside a tag (e.g. "AC/DC") would be read as a directory separator
    # and point the move at a directory that does not exist.
    _artist, _album, _title = [part.replace("/", "-") for part in (_artist, _album, _title)]

    outputDir = os.path.join(output, _artist, _album)
    outputFile = _trackNumber + ". " + _artist + " - " + _title + ".mp3"

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    shutil.move(file, os.path.join(outputDir, outputFile))

def main():
    """Parse the command line and process every *.mp3 file under --directory."""
    parser = argparse.ArgumentParser(description='Tag and organize a collection of mp3 files.')

    parser.add_argument('-d', '--directory', nargs=1, required=True,
                        help='directory that is searched recursively for mp3 files')
    parser.add_argument('-o', '--output', nargs=1, required=True,
                        help='root directory of the organized collection')

    args = parser.parse_args()

    directory = os.path.abspath(args.directory[0])
    output = os.path.abspath(args.output[0])

    # Same source and destination: move() recovers missing tags from the file
    # name; otherwise it queries the web service.
    dirMatch = (directory == output)

    for root, subFolders, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*.mp3'):
            move(output, os.path.join(root, filename), dirMatch)

# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Result:
$ ./mp3-tagger.py -d /UntamedMusic/ -o /Music/
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'01'], 'title': [u'Moving Mountains'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'06'], 'title': [u'Child of Dust'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'03'], 'title': [u"The Earth Isn't Humming"], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'02'], 'title': [u'Digging My Own Grave'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'04'], 'title': [u'The Lion and the Wolf'], 'artist': [u'Thrice']}
{'album': [u'Come All You Weary'], 'tracknumber': [u'01'], 'title': [u'Come All You Weary'], 'artist': [u'Thrice']}
     Looks like we've run into a problem. One of our tracks appears on multiple albums and the first option isn't the one we wanted. We could solve this in a number of ways.
  1. Try to extract more information from the file path or ID3 data.
    This would require that such information is available and consistent. Because our music collection could potentially be in any file structure, we cannot rely on extracting more data from the file path. We also cannot count on any more ID3 data, as the whole purpose of using the web service is to fill in data we do not have.
  2. Use the files in the same directory to determine ID3 data. For example, 5 out of the 6 tracks in the directory I am using come from the same album, so the 6th track most likely comes from that album as well.
    Again, this requires some form of file structure. If a user has all of their mp3 files in a single folder, this method would not work.
  3. Leave the file untouched, leaving the user to manually fix any mp3 files that cannot reliably be tagged and moved.
    We want to automate as much as possible, so this option seems counter to our goals.
  4. Present the options to the user and have him or her select between them.
    This is the one I am most fond of. The user can look at the file path and determine which is the most appropriate selection for the file. This should prevent any errors and only needs user interaction if files are missing ID3 info and appear on multiple albums.
     Let's try to determine when there are multiple options for the album and give the user the option to choose between them.
Code:
#!/usr/bin/python

import sys, os, fnmatch, shutil, argparse, re, urllib, urllib2, xml.dom.minidom
from mutagen.easyid3 import EasyID3
from mutagen.id3 import ID3, TIT2

def getTrackNumber(track):
    """Return the track number of an ID3 track string, zero-padded to 2 digits.

    Only the text before the first "/" is considered, so both "3" and "3/12"
    give "03". Returns None when that text is not made up solely of digits.
    """
    number = track.split("/", 1)[0]
    if number.isdigit():
        return number.zfill(2)
    return None

def getID3FromFilename(file, id3info):
    """Fill in and save ID3 tags derived from the file's path.

    The file name must look like "NN. Artist - Title.mp3"; the album is taken
    from the name of the directory that contains the file.

    Args:
        file: Path to the mp3 file.
        id3info: Tag mapping (an EasyID3 object in this script) supporting
            item assignment and save().

    A file whose name does not follow the pattern is reported and left
    untouched instead of aborting the whole run.
    """
    filename = os.path.basename(file)
    m = re.match(r"(?P<trackNumber>\d{2})\. (?P<artist>((?! -).)+) - (?P<title>[^\.]+)\.mp3", filename)

    if m is None:
        print("Skipping " + file + ": file name is not of the form 'NN. Artist - Title.mp3'")
        return

    id3info["tracknumber"] = m.group('trackNumber')
    id3info["artist"] = m.group('artist')
    id3info["title"] = m.group('title')
    # Name of the parent directory, independent of the path separator in use
    id3info["album"] = os.path.basename(os.path.dirname(file))

    id3info.save()

def getID3FromWeb(file, id3info):
    """Look up this file's recording on MusicBrainz and fill id3info in memory.

    The first recording returned by the artist/title search is used, and only
    when its ext:score attribute is exactly "100". If the recording appears on
    releases with different titles, the user is asked to pick one (or to
    ignore the file). The resulting tags are copied into id3info and printed.
    Nothing is saved to the file here.

    Args:
        file: Path of the mp3 file; shown to the user in messages.
        id3info: Tag mapping whose "artist" and "title" entries are lists.

    Lookup and parsing failures are reported, not raised. When the file is
    ignored, id3info is left unmodified.
    """
    def trackNumberOf(node):
        # The track-list offset is zero-based; track numbers start at 1
        offset = int(node.getElementsByTagName('track-list')[0].getAttribute('offset'))
        return getTrackNumber(str(offset + 1))

    # Artist and title are both required to build the query; this function is
    # reached when some tag is missing, so either of them may be absent.
    try:
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
    except (KeyError, IndexError):
        print("Cannot look up " + file + ": artist and title tags are required")
        return

    # Query the MusicBrainz web service
    try:
        # urlencode() applies str() to each value, which fails on non-ASCII
        # unicode text in Python 2, so encode the query explicitly first.
        lucene = u'artist:' + _artist + u' AND recording:' + _title
        query = { 'query' : lucene.encode('utf-8') }
        response = urllib2.urlopen('http://musicbrainz.org/ws/2/recording?' + urllib.urlencode(query))
        try:
            x = xml.dom.minidom.parse(response)
        finally:
            response.close()

        recordingList = x.getElementsByTagNameNS('http://musicbrainz.org/ns/mmd-2.0#', 'recording-list')[0]
        recordings = recordingList.getElementsByTagName('recording')
        if not recordings:
            print("No MusicBrainz match for " + file)
            return

        recording = recordings[0]
        # getAttribute() yields "" when the attribute is absent, so a missing
        # score simply counts as "not a perfect match".
        if recording.getAttribute('ext:score') != '100':
            return

        # Collected here and applied only at the end, so that ignoring the
        # file really does discard every edit.
        newTags = {
            "tracknumber": trackNumberOf(recording),
            "artist": recording.getElementsByTagName('name')[0].firstChild.nodeValue,
            "title": recording.getElementsByTagName('title')[0].firstChild.nodeValue,
        }

        # Check for multiple releases containing the title
        releaseList = recording.getElementsByTagName('release-list')[0]
        releases = releaseList.getElementsByTagName('release')

        releaseOpts = []   # distinct release titles, in first-seen order
        releaseNodes = []  # releaseNodes[i] is the first release titled releaseOpts[i]

        for release in releases:
            releaseTitle = release.getElementsByTagName('title')[0].firstChild.nodeValue

            # If the release is not already in the list of options, add it
            if releaseTitle not in releaseOpts:
                releaseOpts.append(releaseTitle)
                releaseNodes.append(release)

        # If more than one release option exists, ask the user to pick one
        if len(releaseOpts) > 1:
            print(file + " has multiple options:")
            print("0) Ignore file and discard edits.")
            for index, opt in enumerate(releaseOpts, 1):
                print(str(index) + ") " + opt)

            # raw_input() returns the typed text as-is; Python 2's input()
            # would evaluate it as an expression.
            try:
                choice = int(raw_input("Choice: "))
            except ValueError:
                choice = -1

            if choice == 0:
                print("You chose to ignore the file.")
                return
            if not 1 <= choice <= len(releaseOpts):
                print("Invalid choice; ignoring the file.")
                return

            newTags["album"] = releaseOpts[choice - 1]
            # Index the de-duplicated nodes: positions in `releases` no longer
            # line up with the menu once duplicate titles have been dropped.
            newTags["tracknumber"] = trackNumberOf(releaseNodes[choice - 1])
        else:
            newTags["album"] = releaseOpts[0]

        for key in newTags:
            id3info[key] = newTags[key]

        print(id3info)
    except Exception:
        # Best effort: report the failure type and carry on with the next file
        print(sys.exc_info()[0])

def move(output, file, dirMatch):
    """Check one mp3's tags and, once enabled, move it into the collection.

    Args:
        output: Root directory of the organized collection.
        file: Path of the mp3 file to process.
        dirMatch: True when source and output directories are the same; tags
            are then recovered from the file name, otherwise from the web.

    When any of track number, artist, title or album is missing (or the track
    number is not numeric), the recovery routine is run. In this version the
    function returns before moving anything.
    """
    # EasyID3 needs an existing ID3 header, so create a minimal tag when
    # the file cannot be opened as ID3.
    try:
        tag = ID3(file)
    except Exception:
        tag = ID3()
        tag.add(TIT2(encoding=3, text=["Title"]))
        tag.save(file)

    id3info = EasyID3(file)

    try:
        _trackNumber = getTrackNumber(id3info["tracknumber"][0])
        if _trackNumber is None:
            # A non-numeric track tag is as unusable as a missing one
            raise KeyError("tracknumber")
        _artist = id3info["artist"][0]
        _title = id3info["title"][0]
        _album = id3info["album"][0]
    except KeyError:
        if dirMatch:
            getID3FromFilename(file, id3info)
        else:
            getID3FromWeb(file, id3info)
        return

    # NOTE: deliberate early exit. At this stage of the walkthrough the script
    # only reports tags; the move logic below is kept but not yet enabled.
    return

    # A "/" inside a tag (e.g. "AC/DC") would be read as a directory separator
    # and point the move at a directory that does not exist.
    _artist, _album, _title = [part.replace("/", "-") for part in (_artist, _album, _title)]

    outputDir = os.path.join(output, _artist, _album)
    outputFile = _trackNumber + ". " + _artist + " - " + _title + ".mp3"

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    shutil.move(file, os.path.join(outputDir, outputFile))

def main():
    """Parse the command line and process every *.mp3 file under --directory."""
    parser = argparse.ArgumentParser(description='Tag and organize a collection of mp3 files.')

    parser.add_argument('-d', '--directory', nargs=1, required=True,
                        help='directory that is searched recursively for mp3 files')
    parser.add_argument('-o', '--output', nargs=1, required=True,
                        help='root directory of the organized collection')

    args = parser.parse_args()

    directory = os.path.abspath(args.directory[0])
    output = os.path.abspath(args.output[0])

    # Same source and destination: move() recovers missing tags from the file
    # name; otherwise it queries the web service.
    dirMatch = (directory == output)

    for root, subFolders, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*.mp3'):
            move(output, os.path.join(root, filename), dirMatch)

# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Result:
$ ./mp3-tagger.py -d /UntamedMusic/ -o /Music/
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'01'], 'title': [u'Moving Mountains'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'06'], 'title': [u'Child of Dust'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'03'], 'title': [u"The Earth Isn't Humming"], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'02'], 'title': [u'Digging My Own Grave'], 'artist': [u'Thrice']}
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': [u'04'], 'title': [u'The Lion and the Wolf'], 'artist': [u'Thrice']}
/UntamedMusic/The Alchemy Index Vol. 4 - Earth/Thrice - Come All You Weary.mp3 has multiple options:
0) Ignore file and discard edits.
1) Come All You Weary
2) The Alchemy Index, Volumes III & IV
Choice: 2
{'album': [u'The Alchemy Index, Volumes III & IV'], 'tracknumber': ['05'], 'title': [u'Come All You Weary'], 'artist': [u'Thrice']}
     Now our ID3 data looks exactly how we want it to, so we can move on to the final step of saving the data and moving the file.



; ;