-
Notifications
You must be signed in to change notification settings - Fork 1
/
index_database.py
159 lines (134 loc) · 7.15 KB
/
index_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
import errno
import os
import re
import sqlite3
import sys
import time

import requests

import pywikibot
from pywikibot import pagegenerators
from ksamsok import KSamsok
class Database:
    """Build an SQLite database (db.sqlite) of Swedish churches.

    Data is gathered from Wikidata (items selected by the SPARQL query in
    churches.rq), from Kulturarvsdata (via the ksamsok library) and from the
    Wikipedia / Wikimedia Commons web APIs. The whole build runs from the
    constructor: the database file is recreated from scratch every run.
    """

    def __init__(self):
        """Recreate db.sqlite, populate it via index() and close the connection."""
        # load query files
        with open('churches.rq', 'r') as sparql_file:
            # remove comments and line-breaks so the query is sent as one line
            self.sparql = re.sub(r'(#(()|(.+))\n)|(\n)', '', sparql_file.read())
        with open('create_table.sql', 'r') as sql_table_file:
            self.table_sql = sql_table_file.read()

        # remove existing database
        try:
            os.remove('db.sqlite')
        except OSError as e:
            # errno.ENOENT = no such file or directory; nothing to remove is fine
            if e.errno != errno.ENOENT:
                raise  # re-raise if a different error occurred

        # create and connect to db
        self.db_connection = sqlite3.connect('db.sqlite')
        self.c = self.db_connection.cursor()

        # create table
        self.c.execute(self.table_sql)
        self.db_connection.commit()

        # fetch data and build database
        self.index()

        # close database connection
        self.db_connection.close()

    def index(self):
        """Iterate over every Wikidata item matched by the SPARQL query and
        insert one row per church into the `churches` table, committing after
        each row so a crash mid-run keeps what was already fetched.
        """
        # setup pywikibot and initialize generator
        pywikibot.handle_args(sys.argv[1:])
        site = pywikibot.Site()
        generator = pagegenerators.WikidataSPARQLPageGenerator(self.sparql, site)

        # setup instance of the KSamsok class
        # the api key is never used so no need to use another one
        soch = KSamsok('test')

        for i in generator:
            item = i.get()
            data = {}
            # get the raw wikidata id: strip every character that is not a digit
            data['wikidata'] = re.sub(r'(?!\d).', '', str(i))

            # make sure the item does not exist in our database
            if not self.primary_key_exists(data['wikidata']):
                # parse the kulturarvsdata uri or set to False if invalid
                data['kulturarvsdata'] = soch.formatUri(item['claims']['P1260'][0].getTarget(), 'raw', True)
                # TODO make a log of items with broken kulturarvsdata uris
                if data['kulturarvsdata']:
                    # fetch stuff from the wikidata item
                    # NOTE(review): assumes every matched item has a svwiki
                    # sitelink, P625 coordinates and a Swedish label — a missing
                    # one raises KeyError and aborts the run; confirm the SPARQL
                    # query guarantees these.
                    data['wikipedia'] = item['sitelinks']['svwiki']
                    try:
                        data['commons'] = item['claims']['P373'][0].getTarget()
                    except KeyError:
                        data['commons'] = ''
                    try:
                        # P18 targets render as "[[commons:<filename>]]"; keep only the filename
                        data['image'] = re.sub(r'\]\]', '', re.sub(r'\[\[commons:', '', str(item['claims']['P18'][0].getTarget())))
                    except KeyError:
                        data['image'] = ''
                    coord_pair = item['claims']['P625'][0].getTarget()
                    data['lat'] = coord_pair.lat
                    data['lon'] = coord_pair.lon
                    data['label'] = item['labels']['sv']

                    # fetch stuff from kulturarvsdata;
                    # if the description is missing or too short to be useful, drop it
                    record = soch.getObject(data['kulturarvsdata'])
                    description = record['presentation']['description']
                    if description and len(description) > 30:
                        data['description'] = description
                    else:
                        data['description'] = ''

                    # fetch intro paragraphs from wikipedia
                    # TODO if the connection to wikipedia fails then
                    # the item should be dropped (may need to be refactored)
                    try:
                        r = requests.get('https://sv.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro=&explaintext=&titles=' + data['wikipedia'])
                        result = r.json()
                        # the page id is the unknown single key of 'pages';
                        # the loop exists only to reach it
                        for key in result['query']['pages']:
                            data['wp_description'] = result['query']['pages'][key]['extract']
                    except KeyError:
                        data['wp_description'] = ''

                    try:
                        if data['image'] != '':
                            r = requests.get('https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=pageimages&piprop=thumbnail|name|original&pithumbsize=110&titles=File:' + data['image'])
                            result = r.json()
                            for key in result['query']['pages']:
                                data['image_thumbnail'] = result['query']['pages'][key]['thumbnail']['source']
                                # NOTE(review): 'original' is read from the
                                # 'thumbnail' dict — verify against the
                                # pageimages API response shape
                                data['image_original'] = result['query']['pages'][key]['thumbnail']['original']
                        else:
                            data['image_thumbnail'] = ''
                            data['image_original'] = ''
                    except KeyError:
                        data['image_thumbnail'] = ''
                        data['image_original'] = ''

                    # write and commit church to db
                    self.c.execute('''INSERT INTO `churches` (
                        `wikidata`,
                        `label`,
                        `kulturarvsdata`,
                        `description`,
                        `lat`,
                        `lon`,
                        `wikipedia`,
                        `wp_description`,
                        `commons`,
                        `image`,
                        `image_thumbnail`,
                        `image_original`
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                    (data['wikidata'],
                     data['label'],
                     data['kulturarvsdata'],
                     data['description'],
                     data['lat'],
                     data['lon'],
                     data['wikipedia'],
                     data['wp_description'],
                     data['commons'],
                     data['image'],
                     data['image_thumbnail'],
                     data['image_original']))
                    self.db_connection.commit()

    def primary_key_exists(self, key):
        """Return True if a row with this wikidata id is already in `churches`.

        Uses a parameterized query; the original concatenated the key straight
        into the SQL text, which is injection-prone and raises OperationalError
        for any non-numeric key.
        """
        self.c.execute('SELECT `wikidata` FROM `churches` WHERE `wikidata` = ?', (key,))
        return self.c.fetchone() is not None
# Build the database and report how long the full run took.
start = time.time()
database = Database()
elapsed_minutes = (time.time() - start) / 60
print(f'Done! in {elapsed_minutes} minutes')