Skip to content

Commit 5a73d16

Browse files
author
Philip Mateescu
committed
Added command-line parameters and provided a way to extend the parser with different exporters
1 parent b8abf09 commit 5a73d16

File tree

8 files changed

+935
-707
lines changed

8 files changed

+935
-707
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Session.vim
2+
discogs_*_*.xml
3+

discogsartistparser.py

Lines changed: 95 additions & 127 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
#
88
# This program is distributed in the hope that it will be useful,
99
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1111
# GNU General Public License for more details.
1212
#
1313
# You should have received a copy of the GNU General Public License
@@ -18,143 +18,111 @@
1818
import xml.sax
1919
import os
2020
import sys
21-
import codecs
2221
import model
23-
from postgresexporter import PostgresExporter
24-
import MySQLdb#used to escape strings
25-
import psyco
26-
psyco.full()
22+
#import psyco
23+
#psyco.full()
2724

28-
#artists = {}
2925
artistCounter = 0
3026

31-
class ArtistHandler(xml.sax.handler.ContentHandler):
32-
inElement = {'artists':False,'artist':False,'name':False,'realname':False,'image':False,'images':False,'urls':False,'url':False,'namevariations':False,'aliases':False,'profile':False,'groups':False,'members':False}
33-
artist = None
34-
buffer = ''
35-
36-
def __init__(self):
37-
self.artist = model.Artist()
38-
self.psqlexporter = PostgresExporter()
39-
#self.inElement =
40-
#self.element = {}
41-
42-
def startElement(self, name, attrs):
43-
if not self.inElement.has_key(name):
44-
print "ERROR, UNKOWN ELEMENT!!!"
45-
print name
46-
sys.exit()
47-
self.inElement[name] = True
4827

49-
if name == "artist":
50-
self.artist = model.Artist()
51-
elif name == "image":
52-
self.artist.images.append(model.ImageInfo())
53-
self.artist.images[len(self.artist.images)-1].height = attrs["height"]
54-
self.artist.images[len(self.artist.images)-1].imageType = attrs["type"]
55-
self.artist.images[len(self.artist.images)-1].uri = attrs["uri"]
56-
self.artist.images[len(self.artist.images)-1].uri150 = attrs["uri150"]
57-
self.artist.images[len(self.artist.images)-1].width = attrs["width"]
58-
if len(attrs) != 5:
59-
print "ATTR ERROR"
60-
print attrs
61-
sys.exit()
62-
63-
def characters(self, data):
64-
self.buffer += data
65-
66-
def endDocument(self):
67-
self.psqlexporter.finish()
68-
69-
def endElement(self, name):
70-
self.buffer = self.buffer.strip()
71-
self.buffer = MySQLdb.escape_string(self.buffer)
72-
if name == 'name':
73-
if len(self.buffer) != 0:
74-
if self.inElement['namevariations']:
75-
self.artist.namevariations.append(self.buffer)
76-
elif self.inElement['aliases']:
77-
self.artist.aliases.append(self.buffer)
78-
elif self.inElement['groups']:
79-
self.artist.groups.append(self.buffer)
80-
elif self.inElement['members']:
81-
self.artist.members.append(self.buffer)
82-
else:
83-
self.artist.name = self.buffer
84-
elif name == 'realname':
85-
if len(self.buffer) != 0:
86-
self.artist.realname = self.buffer
87-
elif name == 'profile':
88-
if len(self.buffer) != 0:
89-
self.artist.profile = self.buffer
90-
elif name == 'url':
91-
if len(self.buffer) != 0:
92-
self.artist.urls.append(self.buffer)
93-
'''
94-
if self.buffer.find('wikipedia') != -1:
95-
self.artist.urls['wikipedia'] = self.buffer
96-
elif self.buffer.find('myspace') != -1:
97-
self.artist.urls['myspace'] = self.buffer
98-
else:
99-
self.artist.urls['other'].append(self.buffer)
100-
'''
101-
elif name == "artist":
102-
#global artists
103-
#artists[self.artist.name] = self.artist
28+
class ArtistHandler(xml.sax.handler.ContentHandler):
29+
inElement = {'artists':False,'artist':False,'name':False,'realname':False,'image':False,'images':False,'urls':False,'url':False,'namevariations':False,'aliases':False,'profile':False,'groups':False,'members':False}
30+
artist = None
31+
buffer = ''
10432

105-
global artistCounter
106-
artistCounter += 1
33+
def __init__(self, exporter, stop_after=0):
34+
self.artist = model.Artist()
35+
self.exporter = exporter
36+
#self.inElement =
37+
#self.element = {}
38+
#global options
39+
self.stop_after = stop_after
10740

108-
values = []
109-
values.append(self.artist.name)
110-
columns = "name"
41+
def startElement(self, name, attrs):
42+
if not name in self.inElement:
43+
print "ERROR, UNKOWN ELEMENT!!!"
44+
print name
45+
sys.exit()
46+
self.inElement[name] = True
11147

112-
if len(self.artist.realname) != 0:
113-
values.append(self.artist.realname)
114-
columns += ",realname"
115-
if len(self.artist.profile) != 0:
116-
values.append(self.artist.profile)
117-
columns += ",profile"
118-
if len(self.artist.namevariations) != 0:
119-
values.append(self.artist.namevariations)
120-
columns += ",namevariations"
121-
if len(self.artist.urls) != 0:
122-
values.append(self.artist.urls)
123-
columns += ",urls"
124-
if len(self.artist.aliases) != 0:
125-
values.append(self.artist.aliases)
126-
columns += ",aliases"
127-
if len(self.artist.groups) != 0:
128-
values.append(self.artist.groups)
129-
columns += ",groups"
130-
if len(self.artist.members) != 0:
131-
values.append(self.artist.members)
132-
columns += ",members"
48+
if name == "artist":
49+
self.artist = model.Artist()
50+
elif name == "image":
51+
image = model.ImageInfo()
52+
image.height = attrs["height"]
53+
image.imageType = attrs["type"]
54+
image.uri = attrs["uri"]
55+
image.uri150 = attrs["uri150"]
56+
image.width = attrs["width"]
57+
self.artist.images.append(image)
58+
if len(attrs) != 5:
59+
print "ATTR ERROR"
60+
print attrs
61+
sys.exit()
13362

134-
self.psqlexporter.storeArtist(columns, values, self.artist)
63+
def characters(self, data):
64+
self.buffer += data
13565

136-
print artistCounter
137-
self.buffer = ''
138-
self.inElement[name] = False
66+
def endDocument(self):
67+
self.exporter.finish()
13968

140-
'''
141-
if len(artists) > 100:
142-
for artist in artists:
143-
#print "aRIST+" + artists[artist]
144-
print "name: " + artists[artist].name
145-
print "realname: " + artists[artist].realname
146-
print "namevariations: " + str(artists[artist].namevariations)
147-
print "aliases: " + str(artists[artist].aliases)
148-
print "profile: " + artists[artist].profile
149-
print "urls: " + str(artists[artist].urls)
150-
print "members: " + str(artists[artist].members)
151-
print "groups: " + str(artists[artist].groups)
152-
if len(artists[artist].members) == 0:
153-
print "Not a group"
154-
for img in artists[artist].images:
155-
print "type: " + img.imageType + "size: " + str(img.height) + "x" + str(img.width) + " uri: " + img.uri + " uri150: " + img.uri150
156-
os._exit(0)
157-
#'''
69+
def endElement(self, name):
70+
self.buffer = self.buffer.strip()
71+
if name == 'name':
72+
if len(self.buffer) != 0:
73+
if self.inElement['namevariations']:
74+
self.artist.namevariations.append(self.buffer)
75+
elif self.inElement['aliases']:
76+
self.artist.aliases.append(self.buffer)
77+
elif self.inElement['groups']:
78+
self.artist.groups.append(self.buffer)
79+
elif self.inElement['members']:
80+
self.artist.members.append(self.buffer)
81+
else:
82+
self.artist.name = self.buffer
83+
elif name == 'realname':
84+
if len(self.buffer) != 0:
85+
self.artist.realname = self.buffer
86+
elif name == 'profile':
87+
if len(self.buffer) != 0:
88+
self.artist.profile = self.buffer
89+
elif name == 'url':
90+
if len(self.buffer) != 0:
91+
self.artist.urls.append(self.buffer)
92+
'''
93+
if self.buffer.find('wikipedia') != -1:
94+
self.artist.urls['wikipedia'] = self.buffer
95+
elif self.buffer.find('myspace') != -1:
96+
self.artist.urls['myspace'] = self.buffer
97+
else:
98+
self.artist.urls['other'].append(self.buffer)
99+
'''
100+
elif name == "artist":
158101

102+
self.exporter.storeArtist(self.artist)
103+
global artistCounter
104+
artistCounter += 1
105+
if self.stop_after > 0 and artistCounter >= self.stop_after:
106+
raise model.ParserStopError(artistCounter)
159107

108+
self.buffer = ''
109+
self.inElement[name] = False
160110

111+
'''
112+
if len(artists) > 100:
113+
for artist in artists:
114+
#print "aRIST+" + artists[artist]
115+
print "name: " + artists[artist].name
116+
print "realname: " + artists[artist].realname
117+
print "namevariations: " + str(artists[artist].namevariations)
118+
print "aliases: " + str(artists[artist].aliases)
119+
print "profile: " + artists[artist].profile
120+
print "urls: " + str(artists[artist].urls)
121+
print "members: " + str(artists[artist].members)
122+
print "groups: " + str(artists[artist].groups)
123+
if len(artists[artist].members) == 0:
124+
print "Not a group"
125+
for img in artists[artist].images:
126+
print "type: " + img.imageType + "size: " + str(img.height) + "x" + str(img.width) + " uri: " + img.uri + " uri150: " + img.uri150
127+
os._exit(0)
128+
#'''

0 commit comments

Comments
 (0)