Skip to content

Commit 3bbb1fa

Browse files
committed
cgel2ud.py: full-fledged UD conversion incl. .conllu format, metadata lines, UPOS, XPOS, lemmas, Typo/SpaceAfter. (No morph feats or enhanced deps.)
1 parent 9a29c30 commit 3bbb1fa

File tree

2 files changed

+138
-25
lines changed

2 files changed

+138
-25
lines changed

convertor/cgel2ud.py

Lines changed: 136 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
sys.path.append('../')
33
import cgel
44
from cgel import Tree, Node, trees, Span
5-
from typing import List, Tuple, Set, Mapping
5+
from typing import List, Tuple, Set, Mapping, Literal
6+
from conllu import Token, TokenList
7+
from udapi.core.document import Document
8+
from udapi.block.ud.fixpunct import FixPunct
69

710
from collections import Counter, defaultdict
811

@@ -121,7 +124,8 @@ def remove_gaps(ctree: Tree) -> int:
121124
anode = ctree.tokens[a]
122125

123126
# reassign a's function
124-
assert anode.deprel in ('Prenucleus', 'Postnucleus'),anode.deprel
127+
olddeprel = anode.deprel
128+
assert olddeprel in ('Prenucleus', 'Postnucleus'),olddeprel
125129
anode.deprel = node.deprel
126130

127131
ap = anode.head
@@ -132,7 +136,10 @@ def remove_gaps(ctree: Tree) -> int:
132136
# unattach a from ap, reattach it under np
133137
ctree.children[ap].remove(a)
134138
anode.head = np
135-
ctree.children[np].append(a)
139+
if olddeprel=='Prenucleus':
140+
ctree.children[np].insert(0,a)
141+
else:
142+
ctree.children[np].append(a)
136143
ctree.get_heads()
137144

138145
# coindexation variable no longer needed
@@ -441,7 +448,7 @@ def mark_passive(ctree: Tree, feats: Mapping[int,Set[str]]) -> Mapping[int,Set[s
441448
feats[n].add('pass')
442449
return feats
443450

444-
def process_dependents(ctree: Tree, feats: Mapping[int,Set[str]], lexheads: Mapping[int,int]) -> Mapping[int,str]:
451+
def process_dependents(ctree: Tree, feats: Mapping[int,Set[str]], lexheads: Mapping[int,int]) -> Mapping[int, Tuple[str,int|None,str|None,str|None]]:
445452
"""
446453
(9) head-dependent rules
447454
"""
@@ -533,6 +540,7 @@ def process_dependents(ctree: Tree, feats: Mapping[int,Set[str]], lexheads: Mapp
533540
Nom * Mod PP * nmod # TODO: ignoring :unmarked possibilities
534541
DP * Mod PP * nmod # 'at least 4' (TODO: cxn is subject to debate in UD)
535542
Nom * Comp PP * nmod
543+
PP * Comp PP * nmod # NPN cxn: 'from time to time'
536544
NP * Supplement NP * appos
537545
Nom * Mod NP * appos # integrated appos ('the term motor vehicle')
538546
* * Supplement * * parataxis
@@ -579,9 +587,9 @@ def meets_constraint(val: str | Node, feat: Set[str], constraint: str):
579587
HEADER = RULES_S.splitlines()[1]
580588
column_starts = [i+1 for i,c in enumerate(HEADER) if c=='|']
581589
RULE_LINES = RULES_S.splitlines()[2:-1]
582-
RULES = []
590+
RULES: List[List[str]] = []
583591
for ln in RULE_LINES:
584-
rule = []
592+
rule: List[str] = []
585593
for c,j in enumerate(column_starts):
586594
if c>0:
587595
rule.append(ln[column_starts[c-1]:j].strip())
@@ -590,7 +598,7 @@ def meets_constraint(val: str | Node, feat: Set[str], constraint: str):
590598
rule[-1] = r[:r.index('#')].strip()
591599
RULES.append(rule)
592600

593-
udeprels = {}
601+
udeprels: Mapping[int, Tuple[str,int|None,str|None,str|None]] = {}
594602

595603
# Traverse the tree bottom-up. For each node, process rules in order.
596604
def _process_dependents(n: int):
@@ -626,6 +634,7 @@ def _process_dependents(n: int):
626634
else:
627635
udeprels[lexheads[n]] = (Result, lexheads[p], plex.lexeme, nlex.lexeme)
628636
return
637+
# an error here indicates a configuration not covered by the rules table:
629638
assert lexheads[n] in udeprels,(n,lexheads[n],pcat,plex,nfxn,ncat,nlex,ctree.draw_rec(n,0),udeprels)
630639

631640
_process_dependents(ctree.root)
@@ -634,39 +643,82 @@ def _process_dependents(n: int):
634643

635644

636645

637-
638-
646+
def infer_upos_xpos(node: Node) -> Tuple[str,str]:
    """Infer the Universal POS (UPOS) tag and, when not already specified,
    a Penn-Treebank-style XPOS tag for a CGEL terminal node.

    Args:
        node: a CGEL terminal with .constituent (CGEL category), .xpos
              (possibly empty), .lemma, and .lexeme attributes.

    Returns:
        (upos, xpos) pair of tag strings.

    NOTE: PRP->PRP$ and WP->WP$ modifications for possessives are made
    later by the caller, which knows the dependency relation.
    """
    cgelpos = node.constituent
    if cgelpos=='NP': # a special case produced by relativizers_and_fusion() for relativizer 'that'
        cgelpos = 'WDT'
    # BUGFIX: 'P' previously mapped to ' ADP' (stray leading space), which
    # produced an invalid UPOS tag for all prepositions.
    upos = {'D': 'DET', 'N_pro': 'PRON', 'WDT': 'PRON', 'V_aux': 'AUX',
            'P': 'ADP', 'N': 'NOUN', 'V': 'VERB',
            'Adj': 'ADJ', 'Adv': 'ADV', 'Int': 'INTJ',
            'Sdr': 'SCONJ', 'Coordinator': 'CCONJ'}[cgelpos]
    # TODO: lexical things like D->ADJ/PRON and P->ADV/SCONJ etc.
    xpos = node.xpos
    if xpos in ('CD', 'LS'):
        upos = 'NUM'
    # heuristically assign XPOS if not specified
    if not xpos:
        if cgelpos=='N':
            # capitalized lemma -> proper noun; lemma != surface lexeme -> plural
            if node.lemma[0].isupper():
                upos = 'PROPN'
            if node.lemma != node.lexeme:
                xpos = 'NNPS' if upos=='PROPN' else 'NNS'
            else:
                xpos = 'NNP' if upos=='PROPN' else 'NN'
        else:
            # BUGFIX: 'P' previously mapped to ' IN' (stray leading space),
            # an invalid XPOS tag.
            xpos = {'D': 'WDT' if node.lemma=='how' or node.lemma.startswith('wh') else 'DT',
                    'N_pro': 'WP' if node.lemma.startswith('wh') else ('NN' if node.lemma.endswith(('one','body','thing')) else 'PRP'),
                    'P': 'IN',
                    'Adj': 'JJ',
                    'Adv': 'RB',
                    'Int': 'UH',
                    'Sdr': 'TO' if node.lemma=='to' else 'IN',
                    'WDT': 'WDT', # special case, see above
                    'Coordinator': 'CC'}[cgelpos]
            if xpos in ('JJ','RB'):
                # suffix heuristics for superlative (-est) / comparative (-er)
                if node.lemma.endswith('est') or node.lemma in ('most','least'):
                    xpos += 'S'
                if node.lemma.endswith('er') or node.lemma in ('more','less'):
                    if node.lemma not in ('other','hereinafter'):
                        xpos += 'R'
    # NOTE: PRP->PRP$ and WP->WP$ modifications will be made later based on deprel
    return upos, xpos
639686

640687

641688
def convert(ctree: Tree):
642689
#print(ctree.draw())
643-
udtokenized = [] # tuples (tokstr, ctree node index, suffix type)
690+
udtokenized: List[Tuple[str,str,int|None,Literal['fixed','advmod','case','compound']|None]] = []
691+
"""tuples (tokstr, lemma, ctree node index, suffix type)"""
692+
644693
for n,node in iter(ctree.tokens.items()):
645694
for s in node.prepunct:
646-
udtokenized.append((s, None, None))
695+
udtokenized.append((s, s, None, None))
647696
if node.text:
648697
if node.substrings: # CGEL lexeme has multiple UD tokens (:subt and/or :subp)
649-
udtokenized.append((node.substrings[0][1], n, None))
698+
udtokenized.append((node.substrings[0][1], node.lemma, n, None))
650699
for fld,subt in node.substrings[1:]:
651700
if fld==':subp':
652-
udtokenized.append((subt, None, None))
701+
udtokenized.append((subt, subt, None, None))
653702
continue
654703

655704
if ' ' in node.lexeme:
656705
sufftype = 'fixed'
657706
elif subt in ('not', "n't", 'nt'):
658707
assert node.constituent=='V_aux',node.constituent
659708
sufftype = 'advmod'
709+
subtlemma = 'not'
660710
elif subt in ('s', "'s", "'"):
661711
assert node.constituent in ('N','D'),(node.constituent,node.lexeme)
662712
sufftype = 'case'
713+
subtlemma = "'s"
663714
else:
664715
sufftype = 'compound'
665-
udtokenized.append((subt, None, sufftype))
716+
subtlemma = subt
717+
udtokenized.append((subt, subtlemma, None, sufftype))
666718
else:
667-
udtokenized.append((node.text, n, None))
719+
udtokenized.append((node.text, node.lemma, n, None))
668720
for s in node.postpunct:
669-
udtokenized.append((s, None, None))
721+
udtokenized.append((s, s, None, None))
670722
origS = ctree.draw()
671723
adjust_lexicalization(ctree)
672724
relativizers_and_fusion(ctree)
@@ -679,7 +731,7 @@ def convert(ctree: Tree):
679731
udeprels0 = demote_heads(ctree, feats)
680732
lexheads = propagate_heads(ctree)
681733
feats = mark_passive(ctree, feats)
682-
udeprels = {}
734+
udeprels: Mapping[int, Tuple[str,int,str|None,str|None]] = {}
683735
passive_aux_marked = set()
684736
for n,(h,rel) in sorted(udeprels0.items(), reverse=True): # RTL so we mark the rightmost aux dep of a passive verb as aux:pass, others as plain aux
685737
if rel=='aux*':
@@ -706,29 +758,87 @@ def convert(ctree: Tree):
706758
# print(finalS)
707759
# print(feats)
708760
# assert False
761+
conllutoks = []
709762
cur_n = None
710-
for i,(tok,n,sufftype) in enumerate(udtokenized, start=1):
763+
buffer = ctree.metadata['text']
764+
for i,(tok,lemma,n,sufftype) in enumerate(udtokenized, start=1):
765+
surfacetok = buffer[:len(tok)]
766+
assert surfacetok.lower().replace('’',"'")==tok.lower().replace('’',"'"),(surfacetok,tok)
767+
buffer = buffer[len(surfacetok):]
768+
spaceafter = buffer.startswith(' ')
769+
if spaceafter:
770+
buffer = buffer[1:]
771+
if not buffer:
772+
spaceafter = 'EOS'
773+
correct = None
774+
711775
if n is not None:
712776
deprel = udeprels.get(n)
713777
if not deprel:
714-
assert False,(n,lexheads[n],ctree.tokens[lexheads[n]].lexeme,udtokenized,udeprels)
778+
assert False,(n,lexheads[n],lemma,udtokenized,udeprels)
715779
cur_n = n
780+
# if ctree.tokens[lexheads.get(cur_n,cur_n)].constituent=='NP':
781+
# assert False,(n,lexheads[n],lemma,udtokenized,udeprels,ctree.tokens[lexheads.get(cur_n,cur_n)].deprel,nGapsRemoved)
782+
upos, xpos = infer_upos_xpos(ctree.tokens[lexheads.get(cur_n,cur_n)])
783+
correct = ctree.tokens[lexheads.get(cur_n,cur_n)].correct
716784
elif sufftype is not None:
717785
assert cur_n is not None
718-
deprel = (sufftype, n, ctree.tokens[cur_n].lexeme, tok)
786+
deprel = (sufftype, n, lemma, tok)
787+
match sufftype:
788+
case 'case': # 's
789+
upos = 'PART'
790+
xpos = 'POS'
791+
case 'advmod': # n't
792+
upos = 'PART'
793+
xpos = 'RB'
794+
case _: # compound, fixed
795+
upos, xpos = infer_upos_xpos(ctree.tokens[lexheads[cur_n]])
719796
else:
720797
deprel = 'PUNCT'
798+
upos = 'PUNCT'
799+
xpos = {'!': '.', '?': '.', '--': ':', '—': ':'}.get(tok,tok)
721800

722801
if deprel=='PUNCT':
723-
print(i, tok, deprel, sep='\t')
802+
# attach all punct to root for now; clean up later with udapi FixPunct
803+
conllutoks.append(Token({"id": i, "form": surfacetok, "lemma": lemma,
804+
"upos": upos,
805+
"xpos": xpos,
806+
"feats": None, "head": 0, "deprel": "punct",
807+
"deps": None, "misc": None if spaceafter else 'SpaceAfter=No'}))
808+
#print(i, tok, deprel, sep='\t')
724809
else:
725810
rel, h, hlexeme, nlexeme = deprel # h is the ctree node offset of the lexical head of the dependency
726811
if h is None:
727812
udh = 0 # root
728813
else:
729-
udh = next(i for i,(tok,j,sufftype) in enumerate(udtokenized, start=1) if j==h) # UD token offset corresponding to h
730-
print(i, tok, udh, rel, hlexeme, sep='\t')
731-
print()
814+
udh = next(i for i,(tok,lem,j,sufftype) in enumerate(udtokenized, start=1) if j==h) # UD token offset corresponding to h
815+
816+
if rel=='nmod:poss' and upos=='PRON':
817+
xpos += '$'
818+
misc = []
819+
if correct:
820+
misc.append('CorrectForm=' + correct)
821+
if not spaceafter:
822+
misc.append('SpaceAfter=No')
823+
conllutoks.append(Token({"id": i, "form": surfacetok, "lemma": lemma,
824+
"upos": upos, "xpos": xpos, "feats": 'Typo=Yes' if correct else None, "head": udh,
825+
"deprel": rel, "deps": None, "misc": '|'.join(misc) or None}))
826+
#print(i, tok, udh, rel, hlexeme, sep='\t')
827+
assert not buffer,buffer
828+
829+
for k,v in ctree.metadata.items():
830+
print(f'# {k} = {v}')
831+
treeS = TokenList(conllutoks).serialize()
832+
833+
# all puncts were provisionally attached to root. use UDAPI to infer better attachments
834+
doc = Document()
835+
doc.from_conllu_string(treeS)
836+
treeUDAPI = next(doc.trees)
837+
FixPunct().process_tree(treeUDAPI)
838+
# remove metadata lines inserted by UDAPI as we have already printed the correct ones
839+
treeS = '\n'.join(line for line in doc.to_conllu_string().split('\n') if not line.startswith('# '))
840+
841+
print(treeS)
732842

733843

734844
inFP = sys.argv[1]
@@ -766,4 +876,6 @@ def convert(ctree: Tree):
766876
UAS 93%, LAS 89%, LS(ignoring head) 93%. (not counting punct). exactly the same number of (wrong head, right deprel) and (wrong deprel, right head) pairs
767877
768878
2nd experiment, twitter-etc-trial.cgel, after fixing a couple of bugs: LAS = 83% (vs. 89% from Stanza)
879+
880+
Dec. 2024: tweaked to cover 37 legal-cgel trees and print them in .conllu style
769881
"""

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ conllu>=4.0
44
levenshtein>=0.20.9
55
depedit==3.2.1.0
66
pandas>=2.2.2
7-
tabulate>=0.9.0
7+
tabulate>=0.9.0
8+
udapi>=0.3.0

0 commit comments

Comments
 (0)