Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
leoalenc committed Sep 24, 2024
1 parent 65b97a5 commit 30513c7
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 15 deletions.
42 changes: 32 additions & 10 deletions data/corpus/universal-dependencies/yrl_complin-ud-test.conllu
Original file line number Diff line number Diff line change
Expand Up @@ -17944,6 +17944,27 @@
3 wara wara ADP ADP AdpType=Post|Clitic=Yes 2 case _ _
4 . . PUNCT PUNCT _ 2 punct _ SpaceAfter=No|TokenRange=17:18

# sent_id = Avila2021:0:0:726
# text = Regustari ramé Jesus Cristo, ti maã arama masuiwara indé.
# text_eng = If you love Jesus Christ, it doesn't matter where you're from.
# text_por = Se você gosta de Jesus Cristo, não importa de onde você é.
# text_source = Missão Novas Tribos do Brasil, 2016, canto 195, modif.
# text_annotator = Leonel Figueiredo de Alencar
# inputline = Regustari ramé/sconj Jesus Cristo/=p, ti maã/ind arama/adp masuiwara/advlc indé.
1 Regustari gustari VERB V Mood=Ind|Number=Sing|Person=2|VerbForm=Fin 7 advcl _ TokenRange=0:9
2 ramé ramé SCONJ SCONJ _ 1 mark _ TokenRange=10:14
3 Jesus jesus PROPN PROPN _ 1 obj _ TokenRange=15:20
4 Cristo cristo PROPN PROPN _ 3 flat _ SpaceAfter=No|TokenRange=21:27
5 , , PUNCT PUNCT _ 1 punct _ TokenRange=27:28
6 ti ti PART NEG PartType=Neg|Polarity=Neg 7 advmod _ TokenRange=29:31
7 maã maã PRON IND PronType=Ind 0 root _ TokenRange=32:35
8 arama arama ADP ADP AdpType=Post 7 case _ TokenRange=36:41
9-10 masuiwara _ _ _ _ _ _ _ TokenRange=42:51
9 masuí masuí ADV ADVRC AdvType=Loc|PronType=Int 7 csubj _ _
10 wara wara ADP ADP AdpType=Post|Clitic=Yes 9 case _ _
11 indé indé PRON PRON Number=Sing|Person=2|PronType=Prs 9 nsubj _ SpaceAfter=No|TokenRange=52:56
12 . . PUNCT PUNCT _ 7 punct _ SpaceAfter=No|TokenRange=56:57

# sent_id = NTLN2019:0:0:1
# text = Yakwawa tẽ Deus suiwara-itá yandé, yakwawa tẽ yuíri panhẽ kwá mundu uikú Yuruparí upé.
# text_eng = We know that we are children of God, and that the whole world is under the control of the evil one. [NIV]
Expand Down Expand Up @@ -23313,16 +23334,15 @@
# text_por_sec_source = Avila (2021)
# acknowledgement = DACILAT Project, FAPESP's Process No. 2022/09158-5
# reviewer1 = Leonel Figueiredo de Alencar
# review_status = ongoing
1 Mairamé mairamé SCONJ SCONJR _ 2 mark _ TokenRange=0:7
2 tausú sú VERB V Mood=Ind|Number=Plur|Person=3|VerbForm=Fin 10 advcl _ TokenRange=11:14
3 ã ã PART PFV Aspect=Perf 2 advmod _ TokenRange=15:16
4 taikú ikú AUX AUXFS Mood=Ind|Number=Plur|Person=3|VerbForm=Fin 2 aux _ SpaceAfter=No|TokenRange=20:24
5 , , PUNCT PUNCT _ 10 punct _ TokenRange=24:25
6 kwá kwá DET DEMX Deixis=Prox|Number=Sing|PronType=Dem 8 det _ TokenRange=26:29
7 ta ta PRON PRON2 Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Prs 8 nmod:poss _ TokenRange=30:32
8 aría aría NOUN N Number=Sing 10 nsubj _ TokenRange=33:37
9 waimĩ waimĩ ADJ A _ 8 amod _ TokenRange=38:43
8 aría aría NOUN N Number=Sing 9 nmod:poss _ TokenRange=33:37
9 waimĩ waimĩ NOUN N Number=Sing 10 nsubj _ TokenRange=38:43
10 umbeú mbeú VERB V Mood=Ind|Person=3|VerbForm=Fin 0 root _ TokenRange=44:49
11 aintá aintá PRON PRON Number=Plur|Person=3|PronType=Prs 10 iobj _ SpaceAfter=No|TokenRange=50:55
12 : : PUNCT PUNCT _ 17 punct _ TokenRange=55:56
Expand All @@ -23336,15 +23356,17 @@
20 kuíri kuíri ADV ADVT AdvType=Tim 21 advmod _ TokenRange=91:96
21 pesú sú VERB V Mood=Imp|Number=Plur|Person=2|VerbForm=Fin 17 parataxis _ TokenRange=97:101
22 ã ã PART PFV Aspect=Perf 21 advmod _ SpaceAfter=No|TokenRange=102:103
23 , , PUNCT PUNCT _ 30 punct _ TokenRange=103:104
23 , , PUNCT PUNCT _ 27 punct _ TokenRange=103:104
24 ti ti PART NEG PartType=Neg|Polarity=Neg 27 advmod _ TokenRange=105:107
25 ã ã PART PFV Aspect=Perf 27 advmod _ TokenRange=108:109
26 kurí kurí PART FUT Tense=Fut 27 advmod _ TokenRange=110:114
27 marã marã ADV ADVRU AdvType=Cau|PronType=Int 30 acl _ TokenRange=115:119
28 penhẽ penhẽ PRON PRON Number=Plur|Person=2|PronType=Prs 27 obl _ TokenRange=120:125
29 arã arã ADP ADP AdpType=Post 28 case _ TokenRange=126:129
30 timbiú timbiú NOUN N Number=Sing|Rel=Abs 21 advcl _ SpaceAfter=No|TokenRange=130:136
31 . . PUNCT PUNCT _ 10 punct _ SpaceAfter=No|TokenRange=136:137
26 kurí kurí PART FUT Tense=Fut 27 advmod _ TokenRange=107:111
27-28 marã _ _ _ _ _ _ _ TokenRange=112:116
27 maã maã PRON IND PronType=Ind 17 parataxis _ _
28 arã arã ADP ADP AdpType=Post|Clitic=Yes 27 case _ _
29 penhẽ penhẽ PRON PRON Number=Plur|Person=2|PronType=Prs 27 obl _ TokenRange=117:122
30 arã arã ADP ADP AdpType=Post 29 case _ TokenRange=123:126
31 timbiú timbiú NOUN N Number=Sing|Rel=Abs 27 nsubj _ SpaceAfter=No|TokenRange=127:133
32 . . PUNCT PUNCT _ 10 punct _ SpaceAfter=No|TokenRange=133:134

# sent_id = Casasnovas2006:6:6:65
# text = Awá kurí uruyari se resé, puranga kurí usasá.
Expand Down
18 changes: 13 additions & 5 deletions src/AnnotateConllu.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Leonel Figueiredo de Alencar
# Last update: September 18, 2024
# Last update: September 23, 2024

from Nheengatagger import getparselist, tokenize, DASHES, ELLIPSIS
from BuildDictionary import DIR,MAPPING, extract_feats, loadGlossary, loadLexicon, extractTags, isAux, accent, guessVerb, PRONOUNS, extractArchaicLemmas, IMPIND
Expand Down Expand Up @@ -129,13 +129,16 @@ def extractLemmaVariants(glossary,lemma='arama',pos='posp'):
UPE='upé'
PI=UPE

# normalized lemmatization of allomorph 'rã'
ARAMA='arã'

# clitic adverb "-ntu"
NTU='ntu'

# clitic question particle "-ta"
TA='taá'

NONHYPHEN=[NTU,ME,WARA,WERA]
NONHYPHEN=[NTU,ME,WARA,WERA, ARAMA]

ROOT=[]

Expand Down Expand Up @@ -1903,10 +1906,11 @@ def mkSuff(form,dic):
ntu={'xpos':'ADV','lemma':'ntu','clitic': NTU}
me={'xpos':'ADP','lemma':'upé','clitic':ME}
wara={'xpos':'ADP','lemma':'wara','clitic':WARA}
arama={'xpos':'ADP','lemma':'arã','clitic': ARAMA}
wera={'xpos':'FREQ','lemma':'wera','clitic':WERA}
pi={'xpos':'ADP','lemma':'upé','clitic':PI}
ta={'xpos': 'CQ','lemma':'taá','clitic':TA}
suffs=[ntu,me,pi,ta,wara,wera]
suffs=[ntu,me,pi,ta,wara,wera,arama]
for suff in suffs:
clitic=suff.get('clitic')
if clitic == form:
Expand Down Expand Up @@ -2502,6 +2506,8 @@ def mkConlluSentence(tokens):
newparselist=mkX(correct)
else:
newparselist=getparselist(correct.lower())
if xpos:
newparselist=filterparselist(xpos,newparselist)
elif tag == '=mf':
dic.update(mkModernForm(modern,attribute))
newparselist=getparselist(form.lower())
Expand Down Expand Up @@ -2560,7 +2566,7 @@ def mkConlluSentence(tokens):
new=_mkUpos(form,xpos, orig,orig_form)
newparselist=new['parselist']
elif tag == '=red':
new=handlePartialRedup(form,length)
new=handlePartialRedup(form,length,xpos='V',orig=orig, orig_form=orig_form)
newparselist=new['parselist']
elif tag == '=mid':
new=handleMiddlePassive(form)
Expand All @@ -2587,7 +2593,7 @@ def mkConlluSentence(tokens):
if correct_form and typo:
t['misc'].update({'CorrectForm': correct_form})
if xpos != 'X':
t['feats'].update({'Typo': 'Yes'})
updateFeats(t,'Typo', 'Yes')
t['form']=typo
modern_form=dic.get(attribute)
if modern_form:
Expand Down Expand Up @@ -2911,6 +2917,8 @@ def extractHost(token):
token,tag=pair
if form == 'maita':
return mkHost('mayé',TA,token,'ADVRA')
elif form == 'marã':
return mkHost('maã','arã',token,'IND')
else:
entry=getLocEntry(form)
if entry:
Expand Down

0 comments on commit 30513c7

Please sign in to comment.