-
Notifications
You must be signed in to change notification settings - Fork 19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adaptation à la base JORF #54
base: master
Are you sure you want to change the base?
Changes from 3 commits
37e8cbd
e37be83
5d2a5d5
669c37a
4f152f1
f5043c6
532d113
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,13 +42,13 @@ def scrape_tags(attrs, root, wanted_tags, unwrap=False): | |
) | ||
|
||
|
||
def suppress(get_table, db, liste_suppression): | ||
def suppress(base, get_table, db, liste_suppression): | ||
counts = {} | ||
for path in liste_suppression: | ||
parts = path.split('/') | ||
assert parts[0] == 'legi' | ||
text_cid = parts[11] | ||
assert parts[0] == base.lower() | ||
text_id = parts[-1] | ||
text_cid = parts[11] if base == 'LEGI' else text_id | ||
assert len(text_id) == 20 | ||
table = get_table(parts) | ||
db.run(""" | ||
|
@@ -124,7 +124,7 @@ def suppress(get_table, db, liste_suppression): | |
""", (parts[3], text_cid, text_id)) | ||
count(counts, 'delete from duplicate_files', db.changes()) | ||
total = sum(counts.values()) | ||
print("made", total, "changes in the database based on liste_suppression_legi.dat:", | ||
print("made", total, "changes in the database based on liste_suppression_"+base.lower()+".dat:", | ||
json.dumps(counts, indent=4, sort_keys=True)) | ||
|
||
|
||
|
@@ -171,9 +171,14 @@ def process_archive(db, archive_path, process_links=True): | |
update = db.update | ||
|
||
def get_table(parts): | ||
if parts[-1][4:8] not in TABLES_MAP: | ||
return None | ||
table = TABLES_MAP[parts[-1][4:8]] | ||
if table == 'textes_': | ||
table += parts[13] + 's' | ||
if parts[0] == 'legi': | ||
table += parts[13] + 's' | ||
elif parts[0] == 'jorf': | ||
table += parts[3] + 's' | ||
return table | ||
|
||
counts = {} | ||
|
@@ -183,6 +188,8 @@ def count_one(k): | |
except KeyError: | ||
counts[k] = 1 | ||
|
||
base = db.one("SELECT value FROM db_meta WHERE key = 'base'") or 'LEGI' | ||
|
||
skipped = 0 | ||
unknown_folders = {} | ||
liste_suppression = [] | ||
|
@@ -193,27 +200,53 @@ def count_one(k): | |
if path[-1] == '/': | ||
continue | ||
parts = path.split('/') | ||
if parts[-1] == 'liste_suppression_legi.dat': | ||
if parts[-1] == 'liste_suppression_'+base.lower()+'.dat': | ||
liste_suppression += b''.join(entry.get_blocks()).decode('ascii').split() | ||
continue | ||
if parts[1] == 'legi': | ||
if parts[1] == base.lower(): | ||
path = path[len(parts[0])+1:] | ||
parts = parts[1:] | ||
if not parts[2].startswith('code_et_TNC_'): | ||
if parts[0] not in ['legi', 'jorf'] or \ | ||
(parts[0] == 'legi' and not parts[2].startswith('code_et_TNC_')) or \ | ||
(parts[0] == 'jorf' and parts[2] not in ['article', 'section_ta', 'texte']): | ||
# https://github.com/Legilibre/legi.py/issues/23 | ||
try: | ||
unknown_folders[parts[2]] += 1 | ||
except KeyError: | ||
unknown_folders[parts[2]] = 1 | ||
continue | ||
dossier = parts[3] | ||
text_cid = parts[11] | ||
dossier = parts[3] if parts[0] == 'legi' else 'jorf' | ||
text_cid = parts[11] if parts[0] == 'legi' else None | ||
text_id = parts[-1][:-4] | ||
mtime = entry.mtime | ||
|
||
# Read the file | ||
xml.feed(b''.join(entry.get_blocks())) | ||
root = xml.close() | ||
tag = root.tag | ||
meta = root.find('META') | ||
|
||
# Obtain the CID when database is not LEGI | ||
if base != 'LEGI': | ||
if tag in ['ARTICLE', 'SECTION_TA']: | ||
contexte = root.find('CONTEXTE/TEXTE') | ||
text_cid = attr(contexte, 'cid') | ||
elif tag in ['TEXTELR', 'TEXTE_VERSION']: | ||
meta_spec = meta.find('META_SPEC') | ||
meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE') | ||
text_cid = meta_chronicle.find('CID').text | ||
else: | ||
raise Exception('unexpected tag: '+tag) | ||
|
||
# Skip the file if it hasn't changed, store it if it's a duplicate | ||
duplicate = False | ||
table = get_table(parts) | ||
if table is None: | ||
try: | ||
unknown_folders[text_id] += 1 | ||
except KeyError: | ||
unknown_folders[text_id] = 1 | ||
continue | ||
prev_row = db.one(""" | ||
SELECT mtime, dossier, cid | ||
FROM {0} | ||
|
@@ -270,11 +303,6 @@ def count_one(k): | |
skipped += 1 | ||
continue | ||
|
||
xml.feed(b''.join(entry.get_blocks())) | ||
root = xml.close() | ||
tag = root.tag | ||
meta = root.find('META') | ||
|
||
# Check the ID | ||
if tag == 'SECTION_TA': | ||
assert root.find('ID').text == text_id | ||
|
@@ -323,6 +351,9 @@ def count_one(k): | |
] | ||
elif tag == 'TEXTELR': | ||
assert table == 'textes_structs' | ||
meta_spec = meta.find('META_SPEC') | ||
meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE') | ||
assert meta_chronicle.find('CID').text == text_cid | ||
scrape_tags(attrs, root, TEXTELR_TAGS) | ||
sommaires = [ | ||
{ | ||
|
@@ -454,7 +485,7 @@ def count_one(k): | |
print("skipped", x, "files in unknown folder `%s`" % d) | ||
|
||
if liste_suppression: | ||
suppress(get_table, db, liste_suppression) | ||
suppress(base, get_table, db, liste_suppression) | ||
|
||
|
||
def main(): | ||
|
@@ -467,6 +498,7 @@ def main(): | |
p.add_argument('--pragma', action='append', default=[], | ||
help="Doc: https://www.sqlite.org/pragma.html | Example: journal_mode=WAL") | ||
p.add_argument('--raw', default=False, action='store_true') | ||
p.add_argument('--base') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. le défaut devrait être There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. C’est ce que j’avais fait dans une première version, puis j’ai remplacé par les lignes 510-517 ci-après pour prioriser la valeur db_meta→base (qui elle-même vaut
Si on met une valeur par défaut au paramètre --base, on ne peut plus avoir ce double comportement de vérification ou non et il faudrait toujours ajouter le paramètre --base avec la bonne valeur. Ça complexifierait par exemple si on veut faire une boucle bash qui mettrait à jour plusieurs bases SQLite qui piocheraient dans un même dossier contenant les .tar.gz de toutes les bases. Dans le cas d’une nouvelle base, la rédaction existante est l’équivalent d’un paramètre par défaut qui vaudrait |
||
p.add_argument('--skip-links', default=False, action='store_true', | ||
help="if set, all link metadata will be ignored (the `liens` table will be empty)") | ||
args = p.parse_args() | ||
|
@@ -475,7 +507,14 @@ def main(): | |
os.mkdir(args.anomalies_dir) | ||
|
||
db = connect_db(args.db, pragmas=args.pragma) | ||
base = db.one("SELECT value FROM db_meta WHERE key = 'base'") | ||
last_update = db.one("SELECT value FROM db_meta WHERE key = 'last_update'") | ||
if not base: | ||
base = args.base.upper() if args.base and not last_update else 'LEGI' | ||
db.insert('db_meta', dict(key='base', value=base)) | ||
if args.base and base != args.base.upper(): | ||
print('!> Wrong database: requested '+args.base.upper()+' but existing database is '+base+'.') | ||
raise SystemExit(1) | ||
|
||
# Check and record the data mode | ||
db_meta_raw = db.one("SELECT value FROM db_meta WHERE key = 'raw'") | ||
|
@@ -499,12 +538,12 @@ def main(): | |
|
||
# Look for new archives in the given directory | ||
print("> last_update is", last_update) | ||
archive_re = re.compile(r'(.+_)?legi(?P<global>_global)?_(?P<date>[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE) | ||
archive_re = re.compile(r'(.+_)?'+base.lower()+r'(?P<global>_global)?_(?P<date>[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE) | ||
skipped = 0 | ||
archives = sorted([ | ||
(m.group('date'), bool(m.group('global')), m.group(0)) for m in [ | ||
archive_re.match(fn) for fn in os.listdir(args.directory) | ||
if fnmatch(fn.lower(), '*legi_*.tar.*') | ||
if fnmatch(fn.lower(), '*'+base.lower()+'_*.tar.*') | ||
] | ||
]) | ||
most_recent_global = [t[0] for t in archives if t[1]][-1] | ||
|
@@ -532,13 +571,13 @@ def main(): | |
print('last_update is now set to', last_update) | ||
|
||
# Detect anomalies if requested | ||
if args.anomalies: | ||
if args.anomalies and base == 'LEGI': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ça me semblerait plus propre de lever une erreur si les deux options sont utilisées conjointement There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Effectivement. |
||
fpath = args.anomalies_dir + '/anomalies-' + last_update + '.txt' | ||
with open(fpath, 'w') as f: | ||
n_anomalies = detect_anomalies(db, f) | ||
print("logged", n_anomalies, "anomalies in", fpath) | ||
|
||
if not args.raw: | ||
if not args.raw and base == 'LEGI': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pareil, ça me semblerait plus propre de forcer l'utilisateur à utiliser l'option There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pourquoi pas, mais dans ce cas il faut documenter sur le README.md cette obligation pour ces bases. Et de plus, similairement à l’option --base qui est stockée dans db_meta→base, ça pourrait être plus facile d’utilisation pour les mises à jour que de stocker cette valeur dans db_meta. Ainsi, legi.py appliquerait toujours les mêmes options lors des mises à jour. Mais ceci serait une autre PR. |
||
from .normalize import normalize_text_titles | ||
normalize_text_titles(db) | ||
from .factorize import main as factorize | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
je crois que la notion de dossier n'existe pas pour les bases autres que LEGI. Ça serait plus propre de stocker `None`, non ? Cela implique de changer le schéma mais c'est mieux comme ça à mon avis. cf. ma PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Je m’étais posé la question, mais oui, effectivement, utiliser `None` fait sens. J’avais mis `'jorf'` si jamais une base legi.py contient plusieurs bases (LEGI, JORF, …) mais il serait plus propre d’ajouter une colonne ou d’avoir recours à des tables séparées (par ex. legi_textes_versions) dans cette éventualité.
D’après cette page, il y a 3-4 autres bases avec des dossiers (CASS, INCA, JADE, voire ACCO même si le dossier "bureautique" est complètement hors schéma standard).