Skip to content

Commit

Permalink
simplemma surface forms
Browse files (browse the repository at this point in the history)
  • Loading branch information
aajanki committed May 2, 2022
1 parent ff63b6d commit 70aeaf7
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions nlpmodels.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,10 +475,11 @@ def parse(self, texts):
res.append({'texts': words, 'lemmas': lemmas, 'pos': pos})
return res

def fix_surface_forms(self, destructive_tokenization, gold_sentence):
def fix_surface_forms(self, system_sentence, gold_sentence):
# The tokenizer leaves out some punctuation. Let's try to add it back.
i = 0
text = gold_sentence.text()
destructive_tokenization = system_sentence['texts']
non_destructive_tokenization = []
for t in destructive_tokenization:
m = re.compile(r'\s*(\W{1,2}\s*)?' + re.escape(t)).match(text, i)
Expand All @@ -497,7 +498,9 @@ def fix_surface_forms(self, destructive_tokenization, gold_sentence):
else:
raise ValueError('Failed to align tokenization')

return non_destructive_tokenization
fixed = dict(system_sentence)
fixed['texts'] = non_destructive_tokenization
return fixed


class UralicNLP:
Expand Down

0 comments on commit 70aeaf7

Please sign in to comment.