#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
from __future__ import (unicode_literals, absolute_import,
division, print_function)
import re
import os
import sys
import logging
import hashlib
import xml.etree.ElementTree as ET
import yaml
import requests
import pycountry
try:
text_type = unicode # Python 2
from urlparse import urlparse, urljoin
except NameError:
text_type = str # Python 3
from urllib.parse import urlparse, urljoin
LABELS = {
'nopic': "NO PICTURE",
'novid': "NO VIDEO",
'_ftindex': "Full-text index",
'nodet': "NO DET",
}
logging.basicConfig()
logger = logging.getLogger("library2yaml")
logger.setLevel(logging.DEBUG)
with_checksum = True # debug only
with_size = True # debug only
use_zip = False
def get_remote_checksum(url):
''' retrieve SHA-256 checksum of download using special mirrorbrain url '''
req = requests.get("{url}.sha256".format(url=url))
return req.text.split()[0]
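
# note: the .sha256 sidecar is expected to follow sha256sum(1) output,
# "<hex-digest>  <filename>", which is why the first whitespace-separated
# token is taken as the checksum above.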
def get_remote_checksum_from_header(url):
    ''' retrieves SHA-256 checksum of url from its headers (mirrorbrain)
        not used because the value is encoded (looks like base64 but isn't) '''
req = requests.head(url)
for digest_line in req.headers['Digest'].split():
if digest_line.startswith("SHA-256"):
return digest_line.split("=", 1)[-1]
def get_local_checksum(fpath):
''' calculate SHA-256 checksum from a local file '''
hash_sha = hashlib.sha256()
with open(fpath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_sha.update(chunk)
return hash_sha.hexdigest()
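
# (hashing in 4 KiB chunks keeps memory usage flat even on multi-GB ZIM files)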
def get_remote_size(url):
''' retrieves file size in bytes from its headers '''
req = requests.head(url, allow_redirects=True)
return int(req.headers['Content-Length'])
def get_local_size(fpath):
''' size in bytes of a local file '''
return os.path.getsize(fpath)
def get_zip_url(url):
''' convert .zim.meta4 url to .zip one '''
purl = urlparse(url)
path = purl.path
fname = os.path.basename(url)
dirent = path[:len(path) - len(fname)]
new_fname = "kiwix-0.9+{fname}".format(
fname=re.sub(r'\.zim\.meta4$', '.zip', fname))
new_dirent = dirent.replace("/zim/", "/portable/")
new_path = "{dirent}{fname}".format(dirent=new_dirent, fname=new_fname)
return urljoin(url, new_path)
def get_zim_url(url):
''' convert .zim.meta4 url to .zim one '''
purl = urlparse(url)
path = purl.path
fname = os.path.basename(url)
dirent = path[:len(path) - len(fname)]
new_fname = re.sub(r'\.zim\.meta4$', '.zim', fname)
new_path = "{dirent}{fname}".format(dirent=dirent, fname=new_fname)
return urljoin(url, new_path)
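
# rough sketch of both rewrites on a hypothetical mirror URL:
#   get_zim_url("http://mirror.example/zim/wikipedia/foo_2017-01.zim.meta4")
#     -> "http://mirror.example/zim/wikipedia/foo_2017-01.zim"
#   get_zip_url("http://mirror.example/zim/wikipedia/foo_2017-01.zim.meta4")
#     -> "http://mirror.example/portable/wikipedia/kiwix-0.9+foo_2017-01.zip"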
def clean(text):
''' normalize free text input from library (titles, descriptions) '''
if text is None:
return None
return " ".join(text.splitlines())
def get_attr(book, key, t=text_type):
''' attribute accessor for XML element '''
value = book.attrib.get(key)
if value is None:
return None
return t(value)
def get_kind(book):
    ''' main descriptor of content (wikipedia, wiktionary, etc)
        derived from fname '''
fname = os.path.basename(get_attr(book, 'url'))
return re.sub(r'_20[0-9]{2}\-[0-1][0-9]\.zim\.meta4$', '', fname)
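
# e.g. (hypothetical): a url ending in "wiktionary_fr_all_2016-12.zim.meta4"
# yields the kind "wiktionary_fr_all"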
def read_tags(book):
''' cleaned-up list of tags from XML book '''
return list(set([tag.replace('"', '').replace('=', '')
for tag in (get_attr(book, 'tags') or "").split(';')]))
def get_tags_label(tags):
''' label describing important tags features '''
if not tags:
return None
    labels = [LABELS.get(tag) for tag in tags if tag in LABELS]
if not labels:
return None
return ", ".join(labels)
def convert(library_fpath, catalog_fpath,
format='zim', local_repository=False):
logger.info("starting convertion of `{}` to `{}`"
.format(library_fpath, catalog_fpath))
if not os.path.exists(library_fpath):
raise IOError("missing library file `{}`".format(library_fpath))
tree = ET.parse(library_fpath)
root = tree.getroot()
logger.info("parsing xml library file `{}`".format(library_fpath))
catalog = {}
for book in root.iter('book'):
def ga(k, t=text_type):
return get_attr(book, k, t)
# copy data from attributes
version = ga('date')
name = clean(ga('title'))
description = clean(ga('description'))
meta_url = ga('url') # URL to zim.meta4 file
bid = ga('id') or "none_not-found-in-library.xml" # unique book ID
# update title for tags
tags = read_tags(book)
tags_label = get_tags_label(tags)
if tags_label is not None:
name = "{name} [{label}]".format(name=name, label=tags_label)
# language is used in both ISO 639-3 (pol) and ISO 639-1 (pl)
lang_3 = ga('language') # xml contains ISO 639-3
        # skip content without a language code
        # (old 2009 ubuntudoc, wikipedia ml)
if lang_3 is None:
continue
# find ISO 639-1 from ISO 639-3 (might not exist)
        # some entries carry several codes (e.g. "eng,fra"); keep the first
        if re.search(r'[^a-z]', lang_3) is not None:
            lang_3 = re.split(r'[^a-z]', lang_3, 1)[0]
try:
lang_1 = pycountry.languages.get(**{
'alpha_{}'.format(len(lang_3)): lang_3}).alpha_2
except (AttributeError, KeyError):
lang_1 = lang_3
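        # sketch of the lookup above for a hypothetical 'fra' entry:
        #   pycountry.languages.get(alpha_3='fra').alpha_2 == 'fr'
        # with a fallback to the ISO 639-3 code when no 2-letter code exists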
# url to final content file
url = get_zip_url(meta_url) if use_zip else get_zim_url(meta_url)
# size and checksums are either captured over network or using
# a local copy of all the files.
if local_repository:
full_path = urlparse(url).path
fname = os.path.basename(url)
short_path = full_path[:-len(fname)]
fpath = os.path.join(local_repository, short_path[1:], fname)
        else:
            full_path = fname = short_path = fpath = None
size = None
if with_size:
# size in library is approx in KB. ideascube wants accurate bytes
if local_repository and os.path.exists(fpath):
size = get_local_size(fpath)
if size is None:
logger.debug("fetching size for {}".format(url))
size = get_remote_size(url)
else:
size = ga('size', int) * 1024
# catalog expects an SHA256 checksum of the content
sha256sum = None
if with_checksum:
if local_repository and os.path.exists(fpath):
logger.debug("calculating local checksum for {}".format(fpath))
sha256sum = get_local_checksum(fpath)
if sha256sum is None:
logger.debug("fetching checkum for {}".format(url))
sha256sum = get_remote_checksum(url)
# catalog expects either zipped-zim or zim
btype = "zipped-zim" if use_zip else "zim"
# identifiers
kind = get_kind(book)
if lang_1 is not None:
langid = "{kind}.{lang}".format(kind=kind, lang=lang_1)
else:
langid = kind
catalog[langid] = {
'name': name,
'description': description,
'version': version,
'language': lang_3,
'id': bid,
'url': url,
'size': size,
'sha256sum': sha256sum,
'type': btype,
'langid': langid,
}
logger.info("finished parsing {nb} entries".format(nb=len(catalog.keys())))
logger.info("dumping yaml content to file `{}`".format(catalog_fpath))
    # binary mode: safe_dump writes encoded bytes when encoding is set
    with open(catalog_fpath, 'wb') as fd:
yaml.safe_dump({'all': catalog}, fd,
default_flow_style=False,
allow_unicode=True,
encoding="utf-8")
logger.info("done converting library to catalog.")
if __name__ == '__main__':
    # the optional third argument matches the usage string below
    if len(sys.argv[1:]) not in (2, 3):
logger.error("Usage: {} library.xml catalog.yml [zim|zip]"
.format(sys.argv[0]))
sys.exit(1)
try:
convert(*sys.argv[1:])
except Exception as exp:
        logger.critical("Unhandled exception raised during conversion.")
logger.exception(exp)
logger.error("library file was not converted.")
sys.exit(1)
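
# typical invocation, matching the usage string above:
#   ./library-to-catalog.py library.xml catalog.yml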