-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathuri-def.py
352 lines (316 loc) · 15 KB
/
uri-def.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import re
from sys import argv, stderr
from os.path import isfile, isdir, exists, dirname, join, basename
from os import makedirs
from subprocess import run
from shutil import copyfile
from glob import glob
def get_paths():
    """Split command-line arguments into input spec files and one output dir.

    Any argument naming an existing file is treated as an input; any
    argument naming an existing directory is the destination. Exactly one
    destination and at least one input are required.

    Returns:
        (list_of_spec_paths, destination_dir)
    Raises:
        Exception: if there is not exactly one directory argument, or no
            file arguments at all.
    """
    inputs = [arg for arg in argv[1:] if isfile(arg)]
    dests = [arg for arg in argv[1:] if isdir(arg)]
    if len(dests) != 1:
        raise Exception("One destination expected, but got " + str(dests))
    if not inputs:
        raise Exception("Expected input files, none given")
    return inputs, dests[0]
def get_text(spec):
    """Read and concatenate the contents of the given files.

    File contents are joined with a blank line so that markdown sections
    from adjacent files cannot merge into one.

    Fixes over the original: each file handle is closed promptly via a
    context manager (the original left closing to the garbage collector),
    and files are decoded as UTF-8 explicitly instead of depending on the
    platform locale (the rest of this script assumes UTF-8, e.g. in
    tidy_markdown).

    Args:
        spec: iterable of file paths.
    Returns:
        one string with all file contents, separated by '\n\n'.
    """
    parts = []
    for path in spec:
        with open(path, encoding='utf-8') as fh:
            parts.append(fh.read())
    return '\n\n'.join(parts)
def get_prefixes(txt):
    """Find and parse prefix definition tables.

    Scans for markdown tables whose header row contains
    'Short Prefix | URI Prefix' and collects every `abbr` | `uri` row.

    Returns:
        dict mapping short prefix (e.g. 'g7') to URI prefix.
    """
    table_pattern = r'([^\n]*)Short Prefix *\| *URI Prefix *\|(\s*\|[^\n]*)*'
    row_pattern = r'`([^`]*)` *\| *`([^`]*)`'
    prefixes = {}
    for table in re.finditer(table_pattern, txt):
        for short, uri in re.findall(row_pattern, table.group(0)):
            prefixes[short] = uri
    return prefixes
def find_datatypes(txt, g7):
    """Collect datatype URIs from sections that declare them.

    Looks for markdown sections whose body says "The URI for the `X`
    datatype is `uri`". For every such pair the mapping is recorded, and
    any `g7:`-prefixed URI additionally registers the suffix in *g7* as a
    ('datatype', [section body]) entry (fragment stripped, first
    definition wins).

    Returns:
        dict mapping datatype name to its URI. *g7* is mutated in place.
    """
    section_pattern = r'^#+ *([^\n]*)\n+((?:[^\n]|\n+[^\n#])*[^\n]*URI for[^\n]*datatypes? is(?:[^\n]|\n+[^\n#])*)'
    pair_pattern = r'URI[^\n]*`([^\n`]*)` datatype[^\n]*`([^`\n:]*:[^\n`]*)`'
    datatype_uris = {}
    for sect in re.finditer(section_pattern, txt, re.M):
        body = sect.group(2).strip()
        for name, uri in re.findall(pair_pattern, sect.group(0)):
            datatype_uris[name] = uri
            if not uri.startswith('g7:'):
                continue
            # Drop any fragment before using the suffix as a g7 key.
            suffix = uri.split('#', 1)[0][3:]
            if suffix not in g7:
                g7[suffix] = ('datatype', [body])
    return datatype_uris
def find_cat_tables(txt, g7, tagsets):
    """Looks for tables of tags preceded by a concatenation-based URI

    Raises an exception if any URI is repeated with distinct definitions. This code contains a hard-coded fix for BIRTH which has the same unifying concept but distinct text in the spec.

    Returns a {structure:[list,of,allowed,enums]} mapping

    NOTE(review): the *tagsets* parameter is accepted but never used in
    this function body — presumably kept for signature symmetry with the
    other find_* helpers; confirm before removing.
    """
    # Canonical text for URIs the spec defines twice with differing prose.
    hard_code = {
        "g7:enum-BIRTH": 'Associated with birth, such as a birth name or birth parents.',
    }
    cats = {}   # URI -> definition text (for duplicate detection)
    enums = {}  # structure key -> list of allowed enum URIs (returned)
    # Each "by concatenating `prefix`" phrase introduces a tag table whose
    # URIs are prefix+tag.
    for bit in re.finditer(r'by\s+concatenating\s+`([^`]*)`', txt):
        # Slice out the enclosing section: from the heading before the
        # phrase to the next heading of the same marker ('\n#' + hashes).
        i = txt.rfind('\n#', 0, bit.start())
        j = txt.find(' ',i)
        j = txt.find(txt[i:j+1], j)
        sect = txt[i:j].replace('(Latter-Day Saint Ordinance)','`ord`') ## <- hack for ord-STAT
        # Each table row: `TAG` | meaning |
        for entry in re.finditer(r'`([A-Z0-9_]+)` *\| *(.*?) *[|\n]', sect):
            enum, meaning = entry.groups()
            pfx = bit.group(1)+enum
            # Some rows carry an explicit URI override in their meaning text.
            if 'The URI of this' in meaning:
                meaning, tail = meaning.split('The URI of this')
                pfx = tail.split('`')[1]
            meaning = hard_code.get(pfx,meaning)
            if pfx in cats and meaning != cats[pfx]:
                raise Exception('Concatenated URI '+pfx+' has multiple definitions:'
                    + '\n '+cats[pfx]
                    + '\n '+meaning
                )
            if 'enum-' in pfx:
                # Key the enum set by the nearest preceding heading inside
                # sect: text between the first and last backticks on that
                # heading line, with dots normalized to dashes.
                k1 = sect.find('`', sect.rfind('\n#', 0, entry.start()))
                k2 = sect.rfind('`', 0, sect.find('\n', k1))
                key = sect[k1:k2].replace('`','').replace('.','-')
                enums.setdefault(key,[]).append(pfx)
            if pfx not in cats:
                cats[pfx] = meaning
                if pfx.startswith('g7:'):
                    # A URI may not be both an enumeration and another kind.
                    if pfx[3:] in g7:
                        raise Exception(pfx+' defined as an enumeration and a '+g7[pfx[3:]][0])
                    g7[pfx[3:]] = ('enumeration', [meaning])
    return enums
def find_calendars(txt, g7):
    """Register sections that define a `g7:cal-` URI.

    For every section whose body contains "is `g7:cal-...`", stores the
    cal- suffix in *g7* as a ('calendar', [section body]) entry.
    Mutates *g7* in place; returns None.
    """
    pattern = r'#+ `[^`]*`[^\n]*\n+((?:\n+(?!#)|[^\n])*is `g7:(cal-[^`]*)`(?:\n+(?!#)|[^\n#])*)'
    for match in re.finditer(pattern, txt):
        body, suffix = match.group(1), match.group(2)
        g7[suffix] = ('calendar', [body])
def joint_card(c1, c2):
    """Combine two cardinality strings of the form '{x:y}'.

    The result is required ('{1:...') only if both inputs are required,
    and singular ('...:1}') only if both inputs are singular; otherwise
    the looser bound ('0' / 'M') wins.
    """
    required = c1[1] == '1' and c2[1] == '1'
    singular = c1[3] == '1' and c2[3] == '1'
    return '{%s:%s}' % ('1' if required else '0', '1' if singular else 'M')
def parse_rules(txt):
    """Map each gedstruct rule to its level-n (card, uri) productions.

    Parses every '# `RULE` :=' gedstruct block. Direct productions
    ('n TAG ... {c} uri') are collected immediately; indirect productions
    ('n <<other_rule>> {c}') are resolved by repeated substitution until no
    rule references an unresolved rule, combining cardinalities with
    joint_card along the way. Alternation vs. set is not distinguished.

    Returns:
        {rule: [(card, uri), ...]} including indirectly reachable URIs.
    """
    heading_pattern = r'# *`([A-Z_0-9]+)` *:=\s+```+[^\n]*\n([^`]*)``+[^\n]+((?:[^\n]|\n(?!#))*)'
    direct_pattern = r'^n [A-Z@][^\n]*(\{.:.\}) *(\S+:\S+)'
    indirect_pattern = r'^n <<([^>]*)>>[^\n]*(\{.:.\})'
    direct = {}    # rule -> [(card, uri)]
    indirect = {}  # rule -> [(card, referenced_rule)]
    for rule, block, _notes in re.findall(heading_pattern, txt):
        for card, uri in re.findall(direct_pattern, block, re.M):
            direct.setdefault(rule, []).append((card, uri))
        for ref, card in re.findall(indirect_pattern, block, re.M):
            indirect.setdefault(rule, []).append((card, ref))
    # Fixed point: flatten any rule whose references are all fully resolved.
    while indirect:
        for rule, refs in tuple(indirect.items()):
            if any(ref in indirect for _card, ref in refs):
                continue  # still depends on an unresolved rule; next pass
            for card, ref in refs:
                direct.setdefault(rule, []).extend(
                    (joint_card(card, c2), uri) for c2, uri in direct[ref])
            del indirect[rule]
    return direct
def new_key(val, d, *keys, msg=''):
    """Store *val* in nested dict *d* at the path given by *keys*.

    Intermediate dicts are created as needed. If the final key already
    holds an equal value, nothing happens; if it holds a different value,
    an Exception prefixed with *msg* is raised.
    """
    target = d
    for key in keys[:-1]:
        target = target.setdefault(key, {})
    last = keys[-1]
    if last not in target:
        target[last] = val
    elif target[last] != val:
        raise Exception(msg + 'Duplicate key: ' + str(keys))
def parse_gedstruct(txt, rules, dtypes):
    """Reads through all gedstruct blocks to find payloads, substructures, and superstructures

    Walks every ```gedstruct fenced block line by line, maintaining a
    stack of enclosing structure URIs keyed by the numeric level column.
    Records, for each URI: its payload type (via *dtypes*), which URIs it
    may contain ('sub'), and which may contain it ('sup'), with combined
    cardinalities. <<rule>> references are expanded via *rules* (output of
    parse_rules). Conflicting duplicate entries raise via new_key.

    Returns {uri: {'sub': {...}, 'sup': {...}, 'pay': type-or-None}}.
    """
    sup,sub,payload = {}, {}, {}
    for block in re.findall(r'```[^\n]*gedstruct[^\n]*\n([^`]*)\n```', txt):
        stack = []  # URIs of enclosing structures, innermost last
        for line in block.split('\n'):
            parts = line.strip().split()
            if len(parts) < 3:
                # Only the alternation markers '[', '|', ']' may be short.
                if line not in ('[','|',']'):
                    raise Exception('Invalid gedstruct line: '+repr(line))
                continue
            # Drop an @XREF@ column so tag/payload columns line up.
            if parts[1].startswith('@'): del parts[1]
            if parts[0] == 'n': stack = []  # 'n' restarts at the top level
            else:
                # Numeric level: pop back to the parent at that depth.
                n = int(parts[0])
                while n < len(stack): stack.pop()
            if parts[1].startswith('<'):
                # '<<rule>>' reference: expand all productions of the rule.
                card = parts[2]
                if len(stack):
                    for c,u in rules[parts[1][2:-2]]:
                        new_key(joint_card(card,c), sup, u, stack[-1], msg='rule sup: ')
                        new_key(joint_card(card,c), sub, stack[-1], u, msg='rule sub: ')
            else:
                uri = parts[-1]
                if '{' in uri:
                    # No URI column: a pseudostructure (e.g. header/trailer).
                    uri = parts[1]+' pseudostructure'
                card = parts[-2]
                if len(parts) > 4:
                    # Middle columns are the payload, minus angle brackets.
                    p = ' '.join(parts[2:-2])[1:-1]
                    if p.startswith('<XREF:'): p = '@'+p+'@'
                    elif p == 'Y|<NULL>': pass
                    else: p = dtypes[p]  # must be a known datatype
                else: p = None
                new_key(p, payload, uri, msg='payload: ')
                if len(stack):
                    new_key(card, sup, uri, stack[-1], msg='line sup: ')
                    new_key(card, sub, stack[-1], uri, msg='line sub: ')
                stack.append(uri)
    # Merge the three maps into one record per URI seen anywhere.
    return {k:{'sub':sub.get(k,[]),'sup':sup.get(k,[]),'pay':payload.get(k)} for k in sub.keys()|sup.keys()|payload.keys()}
def find_descriptions(txt, g7, ssp):
    """Collects structure definitions as follows:

    - Sections '#+ TAG (Name) `g7:FULL.TAG`'
    - Sections '#+ `RULE` :=' with only one level-n struct
    - Rows in tables 'Tag | Name<br/>URI | Description'

    Returns a {section header:[list,of,uris]} mapping

    Mutates *g7* in place, appending description strings to each
    structure's text list; cross-checks that every documented structure
    has a gedstruct entry in *ssp* and vice versa.
    """
    # structure sections: '#+ TAG (Name) ... `uri`' followed by body text
    for name,uri,desc in re.findall(r'#+ `[^`]*`[^\n]*\(([^)]*)\)[^\n]*`([^:`\n]*:[^`\n]*)`[^\n]*\n+((?:\n+(?!#)|[^\n])*)', txt):
        if uri not in ssp:
            raise Exception('Found section for '+uri+' but no gedstruct')
        if uri.startswith('g7:'):
            # First sight creates the entry with its sub/sup/payload record.
            g7.setdefault(uri[3:],('structure',[],ssp[uri]))[1].extend((
                name.strip(),
                desc.strip()
            ))
            # '[Aa] type of `X`' links pull in the referenced section's text.
            for other in re.findall(r'[Aa] type of `(\S*)`', desc):
                m = re.search('^#+ +`'+other+r'`[^\n`]*\n((?:[^\n]+|\n+(?!#))*)', txt, re.M)
                if m:
                    g7[uri[3:]][1].append(m.group(1).strip())
    # error check that gedstruct and sections align
    for uri in ssp:
        if 'pseudostructure' in uri: continue
        if uri.startswith('g7:') and uri[3:] not in g7:
            raise Exception('Found gedstruct for '+uri+' but no section')
    # gedstruct sections: '#+ `RULE` :=' blocks with a single-URI body
    for uri, desc in re.findall(r'#+ *`[^`]*` *:=[^\n]*\n+`+[^\n]*\n+n [^\n]*\} *(\S+:\S+) *(?:\n [^\n]*)*\n`+[^\n]*\n+((?:[^\n]|\n(?!#))*)', txt):
        g7[uri[3:]][1].append(desc.strip())
    tagsets = {}
    # tag tables: 'Tag | Name... | Description' under a section heading
    for table in re.finditer(r'\n#+ (\S[-A-Za-z0-9 ]*[a-z0-9])[^#]*?Tag *\| *Name[^|\n]*\| *Description[^\n]*((?:\n[^\n|]*\|[^\n|]*\|[^\n]*)*)', txt):
        # Family/Individual tables use disambiguating tag prefixes.
        pfx = ''
        header = table.group(1)
        if header.startswith('Fam'): pfx = 'FAM-'
        if header.startswith('Indi'): pfx = 'INDI-'
        for tag, name, desc in re.findall(r'`([A-Z_0-9]+)` *\| *([^|\n]*?) *\| *([^|\n]*[^ |\n]) *', table.group(2)):
            if '<br' in name: name = name[:name.find('<br')]
            # Try the bare tag first, then the prefixed form.
            if tag not in g7: tag = pfx+tag
            if tag not in g7:
                raise Exception('Found table for '+tag+' but no section or structure')
            if g7[tag][0] != 'structure':
                raise Exception('Found table for '+tag+' but that\'s a '+g7[tag][0]+' not a structure')
            tagsets.setdefault(header,[]).append(tag)
            g7[tag][1].append(name.strip())
            g7[tag][1].append(desc.strip())
    return tagsets
def find_enum_by_link(txt, enums, tagsets):
    """Extend enums with the tagsets suggested by any section with #enum-
    in the header that lacks a table and links to Events or Attributes.

    For each matching section, every tagset whose header contains 'Event'
    (for an [Events] link) or 'Attribute' (for an [Attributes] link)
    contributes its tags, prefixed with 'g7:'. Mutates *enums* in place.
    """
    section_pattern = r'# *`([A-Z0-9_`.]*)`[^\n]*#enum-[\s\S]*?\n#'
    links = (('[Events]', 'Event'), ('[Attributes]', 'Attribute'))
    for sect in re.finditer(section_pattern, txt):
        body = sect.group(0)
        for marker, word in links:
            if marker not in body:
                continue
            key = sect.group(1).replace('`', '').replace('.', '-')
            for header, tags in tagsets.items():
                if word in header:
                    enums.setdefault(key, []).extend('g7:' + t for t in tags)
def tidy_markdown(md, indent, width=79):
    """Run markdown through pandoc to remove markup and wrap columns.

    Short prefixes (from the module-level ``prefixes`` dict, set in
    __main__) are expanded to full URIs first, then pandoc renders the
    markdown as plain text wrapped to width-indent columns; every line
    after the first is indented by *indent* spaces.
    """
    global prefixes
    for short, uri in prefixes.items():
        md = re.sub(r'\b' + short + ':', uri, md)
    cmd = ['pandoc', '-t', 'plain', '--columns=' + str(width - indent)]
    result = run(cmd, input=md.encode('utf-8'), capture_output=True)
    text = result.stdout.rstrip().decode('utf-8')
    return text.replace('\n', '\n' + ' ' * indent)
def yaml_str_helper(pfx, md, width=79):
    """Format markdown *md* as a YAML value following the prefix *pfx*.

    Runs the text through tidy_markdown with an indent equal to len(pfx).
    If the result contains a blank (indent-only) line, YAML literal-block
    style ('|') is used so the paragraph break survives parsing.
    """
    indent = len(pfx)
    body = tidy_markdown(md, indent, width)
    blank_line = '\n' + ' ' * indent + '\n'
    if blank_line in body:
        return pfx + '|\n' + ' ' * indent + body
    return pfx + body
def expand_prefix(txt, prefixes):
    """Replace a leading 'prefix:' in *txt* with its full URI.

    Prefixes are tried longest first so that e.g. 'g7x:' is not
    misinterpreted as 'g7' + 'x:'. If no prefix matches, *txt* is
    returned unchanged.
    """
    for key in sorted(prefixes, key=len, reverse=True):
        marker = key + ':'
        if txt.startswith(marker):
            return prefixes[key] + txt[len(marker):]
    return txt
if __name__ == '__main__':
    # Driver: parse the spec markdown into URI definitions, then write one
    # YAML file per g7 term into the destination directory plus four TSV
    # lookup tables one level above it.
    # URI definitions
    g7 = {}
    specs, dest = get_paths()
    txt = get_text(specs)
    prefixes = get_prefixes(txt)  # also read as a global by tidy_markdown
    dtypes = find_datatypes(txt, g7)
    rules = parse_rules(txt)
    ssp = parse_gedstruct(txt, rules, dtypes)
    tagsets = find_descriptions(txt, g7, ssp)
    enums = find_cat_tables(txt, g7, tagsets)
    find_enum_by_link(txt, enums, tagsets)
    find_calendars(txt, g7)
    # Rows accumulated for the TSV lookup tables written at the end.
    struct_lookup = []
    enum_lookup = []
    payload_lookup = []
    cardinality_lookup = []
    for tag in g7:
        print('outputting', tag, '...', end=' ')
        # A hand-written terms/<tag> file beside the spec overrides
        # generation: copy it verbatim instead.
        maybe = join(dirname(specs[0]),'terms',tag)
        if exists(maybe):
            copyfile(maybe, join(dest,tag))
            print('by copying', maybe, '...', end=' ')
            continue
        with open(join(dest,tag), 'w') as fh:
            fh.write('%YAML 1.2\n---\n')
            print('type:',g7[tag][0], file=fh)
            # error: type-DATE# type-List#
            uri = expand_prefix('g7:'+tag,prefixes)
            print('\nuri:', uri, file=fh)
            # Only these kinds appear in GEDCOM files as actual tags.
            if g7[tag][0] in ('structure', 'enumeration', 'calendar', 'month'):
                # The standard tag is the part after the last dash
                # (e.g. 'FAM-HUSB' -> 'HUSB').
                ptag = re.sub(r'.*-', '', tag)
                print('\nstandard tag: '+ptag, file=fh)
            print('\ndescriptions:', file=fh)
            for desc in g7[tag][1]:
                print(yaml_str_helper(' - ', desc), file=fh)
            if g7[tag][0] == 'structure':
                d = g7[tag][2]  # {'sub':..., 'sup':..., 'pay':...} record
                payload = expand_prefix(d['pay'],prefixes) if d['pay'] is not None else 'null'
                print('\npayload:', payload, file=fh)
                payload_lookup.append([uri, payload if payload != 'null' else ''])
                # Enum payloads also list their permitted values.
                if d['pay'] and 'Enum' in d['pay']:
                    print('\nenumeration values:', file=fh)
                    for k in sorted(enums[tag]):
                        penum = re.sub(r'.*[-:/]', '', k)
                        puri = expand_prefix(k,prefixes)
                        print(' '+penum+':', expand_prefix(k,prefixes), file=fh)
                        enum_lookup.append([uri,penum,puri])
                if d['sub']:
                    print('\nsubstructures:', file=fh)
                    for k,v in sorted(d['sub'].items()):
                        print(' "'+expand_prefix(k,prefixes)+'": "'+v+'"', file=fh)
                else: print('\nsubstructures: []', file=fh)
                if d['sup']:
                    print('\nsuperstructures:', file=fh)
                    for k,v in sorted(d['sup'].items()):
                        suri = expand_prefix(k,prefixes)
                        print(' "'+suri+'": "'+v+'"', file=fh)
                        struct_lookup.append([suri,ptag,uri])
                        cardinality_lookup.append([suri,uri,v])
                else:
                    # Top-level records have no superstructures.
                    print('\nsuperstructures: []', file=fh)
                    struct_lookup.append(['',ptag,uri])
            fh.write('...\n')
        print('done')
    # Copy any hand-written term files not covered by the loop above.
    for path in glob(join(dirname(specs[0]),'terms','*')):
        tag = basename(path)
        if tag not in g7:
            print('copying', tag, '...', end=' ')
            copyfile(path, join(dest,tag))
            print('done')
    # The TSV lookup tables go in the parent of the destination directory.
    if dest.endswith('/'): dest=dest[:-1]
    base = dirname(dest)
    for data,name in [
        (struct_lookup, join(base,'substructures.tsv')),
        (enum_lookup, join(base,'enumerations.tsv')),
        (payload_lookup, join(base,'payloads.tsv')),
        (cardinality_lookup, join(base,'cardinalities.tsv')),
    ]:
        print('outputting', name, '...', end=' ')
        with open(name, 'w') as f:
            for row in data:
                print('\t'.join(row), file=f)
        print('done')