Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adaptation à la base JORF #54

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
.coverage
*.pyc
.tox/
legi.sqlite*
*.sqlite*
/tarballs/
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ exemple avec [cron][cron] :

(`chronic` fait partie des [`moreutils`](http://joeyh.name/code/moreutils/).)

L'option `--base JORF` permet de créer une base JORF au lieu d'une base LEGI.
Noter que l'option `--raw` est obligatoire pour les bases autres que LEGI.

Une fois la base créée, l'option `--base` n'est plus nécessaire car sa
valeur est enregistrée dans les métadonnées de la base et est utilisée comme
valeur par défaut. Toutefois, il est possible de vérifier que la base à mettre
à jour est du bon type en donnant ce paramètre `--base`.

## Fonctionnalités

### Normalisation des titres et numéros
Expand Down
17 changes: 12 additions & 5 deletions legi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,21 @@

DILA_FTP_HOST = 'echanges.dila.gouv.fr'
DILA_FTP_PORT = 21
DILA_LEGI_DIR = '/LEGI'
DILA_LEGI_DIR = {
'LEGI': '/LEGI',
'JORF': '/JORF',
}


def download_legi(dst_dir):
def download_legi(dst_dir, base='LEGI'):
if not os.path.exists(dst_dir):
os.mkdir(dst_dir)
local_files = {filename: {} for filename in os.listdir(dst_dir)}
ftph = ftplib.FTP()
ftph.connect(DILA_FTP_HOST, DILA_FTP_PORT)
ftph.login()
ftph.cwd(DILA_LEGI_DIR)
remote_files = [filename for filename in ftph.nlst() if '.tar.gz' in filename and ('legi_' in filename or 'LEGI_' in filename)]
ftph.cwd(DILA_LEGI_DIR[base])
remote_files = [filename for filename in ftph.nlst() if '.tar.gz' in filename and (base.lower()+'_' in filename or base+'_' in filename)]
common_files = [f for f in remote_files if f in local_files]
missing_files = [f for f in remote_files if f not in local_files]
remote_files = {filename: {} for filename in remote_files}
Expand Down Expand Up @@ -64,5 +67,9 @@ def download_legi(dst_dir):
if __name__ == '__main__':
    # Command-line entry point: download the archives of the requested
    # DILA base (LEGI by default, or JORF) into the given directory.
    p = argparse.ArgumentParser()
    p.add_argument('directory')
    p.add_argument('--base', default='LEGI')
    args = p.parse_args()
    # Membership test on the dict itself — `.keys()` is redundant.
    if args.base not in DILA_LEGI_DIR:
        print('!> Non-existing database "'+args.base+'".')
        raise SystemExit(1)
    download_legi(args.directory, args.base)
14 changes: 7 additions & 7 deletions legi/sql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CREATE TABLE textes
CREATE TABLE textes_structs
( id char(20) unique not null
, versions text
, dossier text not null
, dossier text
, cid char(20) not null
, mtime int not null
);
Expand Down Expand Up @@ -49,7 +49,7 @@ CREATE TABLE textes_versions
, nota text
, abro text
, rect text
, dossier text not null
, dossier text
, cid char(20) not null
, mtime int not null
, texte_id int references textes
Expand All @@ -63,7 +63,7 @@ CREATE TABLE sections
, titre_ta text
, commentaire text
, parent char(20) -- REFERENCES sections(id)
, dossier text not null
, dossier text
, cid char(20) not null
, mtime int not null
);
Expand All @@ -78,7 +78,7 @@ CREATE TABLE articles
, type text
, nota text
, bloc_textuel text
, dossier text not null
, dossier text
, cid char(20) not null
, mtime int not null
);
Expand Down Expand Up @@ -114,11 +114,11 @@ CREATE TABLE duplicate_files
( id char(20) not null
, sous_dossier text not null
, cid char(20) not null
, dossier text not null
, dossier text
, mtime int not null
, data text not null
, other_cid char(20) not null
, other_dossier text not null
, other_dossier text
, other_mtime int not null
, UNIQUE (id, sous_dossier, cid, dossier)
);
Expand All @@ -132,7 +132,7 @@ CREATE TABLE textes_versions_brutes
, autorite text
, num text
, date_texte day
, dossier text not null
, dossier text
, cid char(20) not null
, mtime int not null
);
Expand Down
85 changes: 67 additions & 18 deletions legi/tar2sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,15 @@ def scrape_tags(attrs, root, wanted_tags, unwrap=False):
)


def suppress(get_table, db, liste_suppression):
def suppress(base, get_table, db, liste_suppression):
counts = {}
for path in liste_suppression:
parts = path.split('/')
assert parts[0] == 'legi'
text_cid = parts[11]
if parts[0] == 'null':
continue
assert parts[0] == base.lower()
text_id = parts[-1]
text_cid = parts[11] if base == 'LEGI' else text_id
assert len(text_id) == 20
table = get_table(parts)
db.run("""
Expand Down Expand Up @@ -124,7 +126,7 @@ def suppress(get_table, db, liste_suppression):
""", (parts[3], text_cid, text_id))
count(counts, 'delete from duplicate_files', db.changes())
total = sum(counts.values())
print("made", total, "changes in the database based on liste_suppression_legi.dat:",
print("made", total, "changes in the database based on liste_suppression_"+base.lower()+".dat:",
json.dumps(counts, indent=4, sort_keys=True))


Expand Down Expand Up @@ -171,9 +173,14 @@ def process_archive(db, archive_path, process_links=True):
update = db.update

def get_table(parts):
if parts[-1][4:8] not in TABLES_MAP:
return None
table = TABLES_MAP[parts[-1][4:8]]
if table == 'textes_':
table += parts[13] + 's'
if parts[0] == 'legi':
table += parts[13] + 's'
elif parts[0] == 'jorf':
table += parts[3] + 's'
return table

counts = {}
Expand All @@ -183,6 +190,8 @@ def count_one(k):
except KeyError:
counts[k] = 1

base = db.one("SELECT value FROM db_meta WHERE key = 'base'") or 'LEGI'

skipped = 0
unknown_folders = {}
liste_suppression = []
Expand All @@ -193,27 +202,53 @@ def count_one(k):
if path[-1] == '/':
continue
parts = path.split('/')
if parts[-1] == 'liste_suppression_legi.dat':
if parts[-1] == 'liste_suppression_'+base.lower()+'.dat':
liste_suppression += b''.join(entry.get_blocks()).decode('ascii').split()
continue
if parts[1] == 'legi':
if parts[1] == base.lower():
path = path[len(parts[0])+1:]
parts = parts[1:]
if not parts[2].startswith('code_et_TNC_'):
if parts[0] not in ['legi', 'jorf'] or \
(parts[0] == 'legi' and not parts[2].startswith('code_et_TNC_')) or \
(parts[0] == 'jorf' and parts[2] not in ['article', 'section_ta', 'texte']):
# https://github.com/Legilibre/legi.py/issues/23
try:
unknown_folders[parts[2]] += 1
except KeyError:
unknown_folders[parts[2]] = 1
continue
dossier = parts[3]
text_cid = parts[11]
dossier = parts[3] if base == 'LEGI' else None
text_cid = parts[11] if base == 'LEGI' else None
text_id = parts[-1][:-4]
mtime = entry.mtime

# Read the file
xml.feed(b''.join(entry.get_blocks()))
root = xml.close()
tag = root.tag
meta = root.find('META')

# Obtain the CID when database is not LEGI
if base != 'LEGI':
if tag in ['ARTICLE', 'SECTION_TA']:
contexte = root.find('CONTEXTE/TEXTE')
text_cid = attr(contexte, 'cid')
elif tag in ['TEXTELR', 'TEXTE_VERSION']:
meta_spec = meta.find('META_SPEC')
meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE')
text_cid = meta_chronicle.find('CID').text
else:
raise Exception('unexpected tag: '+tag)

# Skip the file if it hasn't changed, store it if it's a duplicate
duplicate = False
table = get_table(parts)
if table is None:
try:
unknown_folders[text_id] += 1
except KeyError:
unknown_folders[text_id] = 1
continue
prev_row = db.one("""
SELECT mtime, dossier, cid
FROM {0}
Expand Down Expand Up @@ -270,11 +305,6 @@ def count_one(k):
skipped += 1
continue

xml.feed(b''.join(entry.get_blocks()))
root = xml.close()
tag = root.tag
meta = root.find('META')

# Check the ID
if tag == 'SECTION_TA':
assert root.find('ID').text == text_id
Expand Down Expand Up @@ -323,6 +353,9 @@ def count_one(k):
]
elif tag == 'TEXTELR':
assert table == 'textes_structs'
meta_spec = meta.find('META_SPEC')
meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE')
assert meta_chronicle.find('CID').text == text_cid
scrape_tags(attrs, root, TEXTELR_TAGS)
sommaires = [
{
Expand Down Expand Up @@ -454,7 +487,7 @@ def count_one(k):
print("skipped", x, "files in unknown folder `%s`" % d)

if liste_suppression:
suppress(get_table, db, liste_suppression)
suppress(base, get_table, db, liste_suppression)


def main():
Expand All @@ -467,6 +500,7 @@ def main():
p.add_argument('--pragma', action='append', default=[],
help="Doc: https://www.sqlite.org/pragma.html | Example: journal_mode=WAL")
p.add_argument('--raw', default=False, action='store_true')
p.add_argument('--base', choices=["LEGI", "JORF"])
p.add_argument('--skip-links', default=False, action='store_true',
help="if set, all link metadata will be ignored (the `liens` table will be empty)")
args = p.parse_args()
Expand All @@ -475,7 +509,18 @@ def main():
os.mkdir(args.anomalies_dir)

db = connect_db(args.db, pragmas=args.pragma)
base = db.one("SELECT value FROM db_meta WHERE key = 'base'")
last_update = db.one("SELECT value FROM db_meta WHERE key = 'last_update'")
if not base:
base = args.base.upper() if args.base and not last_update else 'LEGI'
db.insert('db_meta', dict(key='base', value=base))
if args.base and base != args.base.upper():
print('!> Wrong database: requested '+args.base.upper()+' but existing database is '+base+'.')
raise SystemExit(1)

if base != 'LEGI' and args.anomalies:
print("!> The --anomalies option can only be used with the LEGI base")
raise SystemExit(1)

# Check and record the data mode
db_meta_raw = db.one("SELECT value FROM db_meta WHERE key = 'raw'")
Expand All @@ -488,6 +533,10 @@ def main():
if db_meta_raw != args.raw:
db.insert('db_meta', dict(key='raw', value=args.raw), replace=True)

if base != 'LEGI' and not args.raw:
print("!> You need to use the --raw option when working with bases other than LEGI.")
raise SystemExit(1)

# Handle the --skip-links option
has_links = bool(db.one("SELECT 1 FROM liens LIMIT 1"))
if not args.skip_links and not has_links and last_update is not None:
Expand All @@ -499,12 +548,12 @@ def main():

# Look for new archives in the given directory
print("> last_update is", last_update)
archive_re = re.compile(r'(.+_)?legi(?P<global>_global)?_(?P<date>[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE)
archive_re = re.compile(r'(.+_)?'+base.lower()+r'(?P<global>_global)?_(?P<date>[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE)
skipped = 0
archives = sorted([
(m.group('date'), bool(m.group('global')), m.group(0)) for m in [
archive_re.match(fn) for fn in os.listdir(args.directory)
if fnmatch(fn.lower(), '*legi_*.tar.*')
if fnmatch(fn.lower(), '*'+base.lower()+'_*.tar.*')
]
])
most_recent_global = [t[0] for t in archives if t[1]][-1]
Expand Down