22sys .path .append ('../' )
33import cgel
44from cgel import Tree , Node , trees , Span
5- from typing import List , Tuple , Set , Mapping
5+ from typing import List , Tuple , Set , Mapping , Literal
6+ from conllu import Token , TokenList
7+ from udapi .core .document import Document
8+ from udapi .block .ud .fixpunct import FixPunct
69
710from collections import Counter , defaultdict
811
@@ -121,7 +124,8 @@ def remove_gaps(ctree: Tree) -> int:
121124 anode = ctree .tokens [a ]
122125
123126 # reassign a's function
124- assert anode .deprel in ('Prenucleus' , 'Postnucleus' ),anode .deprel
127+ olddeprel = anode .deprel
128+ assert olddeprel in ('Prenucleus' , 'Postnucleus' ),olddeprel
125129 anode .deprel = node .deprel
126130
127131 ap = anode .head
@@ -132,7 +136,10 @@ def remove_gaps(ctree: Tree) -> int:
132136 # unattach a from ap, reattach it under np
133137 ctree .children [ap ].remove (a )
134138 anode .head = np
135- ctree .children [np ].append (a )
139+ if olddeprel == 'Prenucleus' :
140+ ctree .children [np ].insert (0 ,a )
141+ else :
142+ ctree .children [np ].append (a )
136143 ctree .get_heads ()
137144
138145 # coindexation variable no longer needed
@@ -441,7 +448,7 @@ def mark_passive(ctree: Tree, feats: Mapping[int,Set[str]]) -> Mapping[int,Set[s
441448 feats [n ].add ('pass' )
442449 return feats
443450
444- def process_dependents (ctree : Tree , feats : Mapping [int ,Set [str ]], lexheads : Mapping [int ,int ]) -> Mapping [int ,str ]:
451+ def process_dependents (ctree : Tree , feats : Mapping [int ,Set [str ]], lexheads : Mapping [int ,int ]) -> Mapping [int , Tuple [ str , int | None , str | None , str | None ] ]:
445452 """
446453 (9) head-dependent rules
447454 """
@@ -533,6 +540,7 @@ def process_dependents(ctree: Tree, feats: Mapping[int,Set[str]], lexheads: Mapp
533540 Nom * Mod PP * nmod # TODO: ignoring :unmarked possibilities
534541 DP * Mod PP * nmod # 'at least 4' (TODO: cxn is subject to debate in UD)
535542 Nom * Comp PP * nmod
543+ PP * Comp PP * nmod # NPN cxn: 'from time to time'
536544 NP * Supplement NP * appos
537545 Nom * Mod NP * appos # integrated appos ('the term motor vehicle')
538546 * * Supplement * * parataxis
@@ -579,9 +587,9 @@ def meets_constraint(val: str | Node, feat: Set[str], constraint: str):
579587 HEADER = RULES_S .splitlines ()[1 ]
580588 column_starts = [i + 1 for i ,c in enumerate (HEADER ) if c == '|' ]
581589 RULE_LINES = RULES_S .splitlines ()[2 :- 1 ]
582- RULES = []
590+ RULES : List [ List [ str ]] = []
583591 for ln in RULE_LINES :
584- rule = []
592+ rule : List [ str ] = []
585593 for c ,j in enumerate (column_starts ):
586594 if c > 0 :
587595 rule .append (ln [column_starts [c - 1 ]:j ].strip ())
@@ -590,7 +598,7 @@ def meets_constraint(val: str | Node, feat: Set[str], constraint: str):
590598 rule [- 1 ] = r [:r .index ('#' )].strip ()
591599 RULES .append (rule )
592600
593- udeprels = {}
601+ udeprels : Mapping [ int , Tuple [ str , int | None , str | None , str | None ]] = {}
594602
595603 # Traverse the tree bottom-up. For each node, process rules in order.
596604 def _process_dependents (n : int ):
@@ -626,6 +634,7 @@ def _process_dependents(n: int):
626634 else :
627635 udeprels [lexheads [n ]] = (Result , lexheads [p ], plex .lexeme , nlex .lexeme )
628636 return
637+ # an error here indicates a configuration not covered by the rules table:
629638 assert lexheads [n ] in udeprels ,(n ,lexheads [n ],pcat ,plex ,nfxn ,ncat ,nlex ,ctree .draw_rec (n ,0 ),udeprels )
630639
631640 _process_dependents (ctree .root )
@@ -634,39 +643,82 @@ def _process_dependents(n: int):
634643
635644
636645
637-
638-
def infer_upos_xpos(node: Node) -> Tuple[str, str]:
    """Infer UD UPOS and PTB-style XPOS tags for a CGEL terminal node.

    Maps the node's CGEL category (``node.constituent``) to a UPOS tag and
    keeps the annotated XPOS when present; otherwise a heuristic XPOS is
    guessed from the lemma/lexeme.

    Fix: the mapping values for ``'P'`` had a stray leading space inside the
    string literals (``' ADP'``, ``' IN'``), which produced malformed tags
    for every preposition; corrected to ``'ADP'`` / ``'IN'``.

    :param node: terminal node exposing ``constituent``, ``xpos``, ``lemma``,
        and ``lexeme`` attributes (read-only here)
    :return: ``(upos, xpos)`` pair of tag strings
    :raises KeyError: if ``node.constituent`` is not a known CGEL category
    """
    cgelpos = node.constituent
    if cgelpos == 'NP':  # special case produced by relativizers_and_fusion() for relativizer 'that'
        cgelpos = 'WDT'
    upos = {'D': 'DET', 'N_pro': 'PRON', 'WDT': 'PRON', 'V_aux': 'AUX',
            'P': 'ADP', 'N': 'NOUN', 'V': 'VERB',
            'Adj': 'ADJ', 'Adv': 'ADV', 'Int': 'INTJ',
            'Sdr': 'SCONJ', 'Coordinator': 'CCONJ'}[cgelpos]
    # TODO: lexical things like D->ADJ/PRON and P->ADV/SCONJ etc.
    xpos = node.xpos
    if xpos in ('CD', 'LS'):  # numeral/list-item XPOS overrides the category-based UPOS
        upos = 'NUM'
    # heuristically assign XPOS if not specified
    if not xpos:
        if cgelpos == 'N':
            if node.lemma[0].isupper():  # capitalized lemma -> treat as proper noun
                upos = 'PROPN'
            if node.lemma != node.lexeme:  # surface form differs from lemma -> plural
                xpos = 'NNPS' if upos == 'PROPN' else 'NNS'
            else:
                xpos = 'NNP' if upos == 'PROPN' else 'NN'
        else:
            xpos = {'D': 'WDT' if node.lemma == 'how' or node.lemma.startswith('wh') else 'DT',
                    'N_pro': 'WP' if node.lemma.startswith('wh') else ('NN' if node.lemma.endswith(('one', 'body', 'thing')) else 'PRP'),
                    'P': 'IN',
                    'Adj': 'JJ',
                    'Adv': 'RB',
                    'Int': 'UH',
                    'Sdr': 'TO' if node.lemma == 'to' else 'IN',
                    'WDT': 'WDT',  # special case, see above
                    'Coordinator': 'CC'}[cgelpos]
            if xpos in ('JJ', 'RB'):
                # comparative/superlative morphology (suffixes may stack, e.g. never both apply)
                if node.lemma.endswith('est') or node.lemma in ('most', 'least'):
                    xpos += 'S'
                if node.lemma.endswith('er') or node.lemma in ('more', 'less'):
                    if node.lemma not in ('other', 'hereinafter'):
                        xpos += 'R'
    # NOTE: PRP->PRP$ and WP->WP$ modifications will be made later based on deprel

    return upos, xpos
639686
640687
641688def convert (ctree : Tree ):
642689 #print(ctree.draw())
643- udtokenized = [] # tuples (tokstr, ctree node index, suffix type)
690+ udtokenized : List [Tuple [str ,str ,int | None ,Literal ['fixed' ,'advmod' ,'case' ,'compound' ]| None ]] = []
691+ """tuples (tokstr, lemma, ctree node index, suffix type)"""
692+
644693 for n ,node in iter (ctree .tokens .items ()):
645694 for s in node .prepunct :
646- udtokenized .append ((s , None , None ))
695+ udtokenized .append ((s , s , None , None ))
647696 if node .text :
648697 if node .substrings : # CGEL lexeme has multiple UD tokens (:subt and/or :subp)
649- udtokenized .append ((node .substrings [0 ][1 ], n , None ))
698+ udtokenized .append ((node .substrings [0 ][1 ], node . lemma , n , None ))
650699 for fld ,subt in node .substrings [1 :]:
651700 if fld == ':subp' :
652- udtokenized .append ((subt , None , None ))
701+ udtokenized .append ((subt , subt , None , None ))
653702 continue
654703
655704 if ' ' in node .lexeme :
656705 sufftype = 'fixed'
657706 elif subt in ('not' , "n't" , 'nt' ):
658707 assert node .constituent == 'V_aux' ,node .constituent
659708 sufftype = 'advmod'
709+ subtlemma = 'not'
660710 elif subt in ('s' , "'s" , "'" ):
661711 assert node .constituent in ('N' ,'D' ),(node .constituent ,node .lexeme )
662712 sufftype = 'case'
713+ subtlemma = "'s"
663714 else :
664715 sufftype = 'compound'
665- udtokenized .append ((subt , None , sufftype ))
716+ subtlemma = subt
717+ udtokenized .append ((subt , subtlemma , None , sufftype ))
666718 else :
667- udtokenized .append ((node .text , n , None ))
719+ udtokenized .append ((node .text , node . lemma , n , None ))
668720 for s in node .postpunct :
669- udtokenized .append ((s , None , None ))
721+ udtokenized .append ((s , s , None , None ))
670722 origS = ctree .draw ()
671723 adjust_lexicalization (ctree )
672724 relativizers_and_fusion (ctree )
@@ -679,7 +731,7 @@ def convert(ctree: Tree):
679731 udeprels0 = demote_heads (ctree , feats )
680732 lexheads = propagate_heads (ctree )
681733 feats = mark_passive (ctree , feats )
682- udeprels = {}
734+ udeprels : Mapping [ int , Tuple [ str , int , str | None , str | None ]] = {}
683735 passive_aux_marked = set ()
684736 for n ,(h ,rel ) in sorted (udeprels0 .items (), reverse = True ): # RTL so we mark the rightmost aux dep of a passive verb as aux:pass, others as plain aux
685737 if rel == 'aux*' :
@@ -706,29 +758,87 @@ def convert(ctree: Tree):
706758 # print(finalS)
707759 # print(feats)
708760 # assert False
761+ conllutoks = []
709762 cur_n = None
710- for i ,(tok ,n ,sufftype ) in enumerate (udtokenized , start = 1 ):
763+ buffer = ctree .metadata ['text' ]
764+ for i ,(tok ,lemma ,n ,sufftype ) in enumerate (udtokenized , start = 1 ):
765+ surfacetok = buffer [:len (tok )]
766+ assert surfacetok .lower ().replace ('’' ,"'" )== tok .lower ().replace ('’' ,"'" ),(surfacetok ,tok )
767+ buffer = buffer [len (surfacetok ):]
768+ spaceafter = buffer .startswith (' ' )
769+ if spaceafter :
770+ buffer = buffer [1 :]
771+ if not buffer :
772+ spaceafter = 'EOS'
773+ correct = None
774+
711775 if n is not None :
712776 deprel = udeprels .get (n )
713777 if not deprel :
714- assert False ,(n ,lexheads [n ],ctree . tokens [ lexheads [ n ]]. lexeme ,udtokenized ,udeprels )
778+ assert False ,(n ,lexheads [n ],lemma ,udtokenized ,udeprels )
715779 cur_n = n
780+ # if ctree.tokens[lexheads.get(cur_n,cur_n)].constituent=='NP':
781+ # assert False,(n,lexheads[n],lemma,udtokenized,udeprels,ctree.tokens[lexheads.get(cur_n,cur_n)].deprel,nGapsRemoved)
782+ upos , xpos = infer_upos_xpos (ctree .tokens [lexheads .get (cur_n ,cur_n )])
783+ correct = ctree .tokens [lexheads .get (cur_n ,cur_n )].correct
716784 elif sufftype is not None :
717785 assert cur_n is not None
718- deprel = (sufftype , n , ctree .tokens [cur_n ].lexeme , tok )
786+ deprel = (sufftype , n , lemma , tok )
787+ match sufftype :
788+ case 'case' : # 's
789+ upos = 'PART'
790+ xpos = 'POS'
791+ case 'advmod' : # n't
792+ upos = 'PART'
793+ xpos = 'RB'
794+ case _: # compound, fixed
795+ upos , xpos = infer_upos_xpos (ctree .tokens [lexheads [cur_n ]])
719796 else :
720797 deprel = 'PUNCT'
798+ upos = 'PUNCT'
799+ xpos = {'!' : '.' , '?' : '.' , '--' : ':' , '—' : ':' }.get (tok ,tok )
721800
722801 if deprel == 'PUNCT' :
723- print (i , tok , deprel , sep = '\t ' )
802+ # attach all punct to root for now; clean up later with udapi FixPunct
803+ conllutoks .append (Token ({"id" : i , "form" : surfacetok , "lemma" : lemma ,
804+ "upos" : upos ,
805+ "xpos" : xpos ,
806+ "feats" : None , "head" : 0 , "deprel" : "punct" ,
807+ "deps" : None , "misc" : None if spaceafter else 'SpaceAfter=No' }))
808+ #print(i, tok, deprel, sep='\t')
724809 else :
725810 rel , h , hlexeme , nlexeme = deprel # h is the ctree node offset of the lexical head of the dependency
726811 if h is None :
727812 udh = 0 # root
728813 else :
729- udh = next (i for i ,(tok ,j ,sufftype ) in enumerate (udtokenized , start = 1 ) if j == h ) # UD token offset corresponding to h
730- print (i , tok , udh , rel , hlexeme , sep = '\t ' )
731- print ()
814+ udh = next (i for i ,(tok ,lem ,j ,sufftype ) in enumerate (udtokenized , start = 1 ) if j == h ) # UD token offset corresponding to h
815+
816+ if rel == 'nmod:poss' and upos == 'PRON' :
817+ xpos += '$'
818+ misc = []
819+ if correct :
820+ misc .append ('CorrectForm=' + correct )
821+ if not spaceafter :
822+ misc .append ('SpaceAfter=No' )
823+ conllutoks .append (Token ({"id" : i , "form" : surfacetok , "lemma" : lemma ,
824+ "upos" : upos , "xpos" : xpos , "feats" : 'Typo=Yes' if correct else None , "head" : udh ,
825+ "deprel" : rel , "deps" : None , "misc" : '|' .join (misc ) or None }))
826+ #print(i, tok, udh, rel, hlexeme, sep='\t')
827+ assert not buffer ,buffer
828+
829+ for k ,v in ctree .metadata .items ():
830+ print (f'# { k } = { v } ' )
831+ treeS = TokenList (conllutoks ).serialize ()
832+
833+ # all puncts were provisionally attached to root. use UDAPI to infer better attachments
834+ doc = Document ()
835+ doc .from_conllu_string (treeS )
836+ treeUDAPI = next (doc .trees )
837+ FixPunct ().process_tree (treeUDAPI )
838+ # remove metadata lines inserted by UDAPI as we have already printed the correct ones
839+ treeS = '\n ' .join (line for line in doc .to_conllu_string ().split ('\n ' ) if not line .startswith ('# ' ))
840+
841+ print (treeS )
732842
733843
734844inFP = sys .argv [1 ]
@@ -766,4 +876,6 @@ def convert(ctree: Tree):
766876UAS 93%, LAS 89%, LS(ignoring head) 93%. (not counting punct). exactly the same number of (wrong head, right deprel) and (wrong deprel, right head) pairs
767877
7688782nd experiment, twitter-etc-trial.cgel, after fixing a couple of bugs: LAS = 83% (vs. 89% from Stanza)
879+
880+ Dec. 2024: tweaked to cover 37 legal-cgel trees and print them in .conllu style
769881"""
0 commit comments