Fix surface forms by inserting multi-word tokens
aajanki committed May 2, 2022
1 parent 6d3bbd6 commit 2b77c19
Showing 2 changed files with 63 additions and 20 deletions.
nlpmodels.py: 58 additions & 17 deletions
@@ -34,14 +34,17 @@ def initialize(self):
     def parse(self, texts):
         return process_spacy(self.nlp, texts)
 
-    def fix_surface_forms(self, destructive_tokenization, gold_sentence):
+    def fix_surface_forms(self, system_sentence, gold_sentence):
         # This is not even trying to be general, but fixes just enough for the
-        # CoNLL evaluation to run. The correct fix would be to handle these as
-        # multi-word tokens.
-        if len(destructive_tokenization) > 5 and destructive_tokenization[5] == 'En':
-            return destructive_tokenization[:5] + ['Em'] + destructive_tokenization[6:]
+        # CoNLL evaluation to run.
+        assert 'id' not in system_sentence
+
+        texts = system_sentence['texts']
+        if (len(gold_sentence.tokens) > 5 and gold_sentence.tokens[5].text == 'Emmä'
+                and len(texts) > 5 and texts[5] == 'En'):
+            return insert_multi_word(system_sentence, 5, '6-7', 'Emmä')
         else:
-            return destructive_tokenization
+            return system_sentence
 
 
 class Voikko:
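
For orientation, a hedged sketch of the mismatch this branch repairs; the sentence is invented, and the dict shape follows the parse() output above:

# Hypothetical sentence: the gold treebank stores the colloquial 'Emmä' as a
# multi-word token covering syntactic words 6-7 ('en' + 'mä'), while the
# system output contains only the two syntactic words and no surface form.
system_sentence = {
    'texts':  ['Ja', 'sit', 'se', 'sano', 'et', 'En', 'mä', 'tiedä'],
    'lemmas': ['ja', 'sitten', 'se', 'sanoa', 'että', 'ei', 'minä', 'tietää'],
    'pos':    ['CCONJ', 'ADV', 'PRON', 'VERB', 'SCONJ', 'AUX', 'PRON', 'VERB'],
}
# texts[5] == 'En' and gold_sentence.tokens[5].text == 'Emmä', so the branch
# above calls insert_multi_word(system_sentence, 5, '6-7', 'Emmä') to add the
# missing surface-form row (helper defined at the bottom of this file).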
@@ -352,18 +355,20 @@ def parse(self, texts):
             res.append({'texts': words, 'lemmas': lemmas, 'pos': pos})
         return res
 
-    def fix_surface_forms(self, destructive_tokenization, gold_sentence):
+    def fix_surface_forms(self, system_sentence, gold_sentence):
         # This is not even trying to be general, but fixes just enough for the
-        # CoNLL evaluation to run. The correct fix would be to handle these as
-        # multi-word tokens.
-        if destructive_tokenization[0] == 'Eivtta':
-            return ['Ei'] + destructive_tokenization[1:]
-        elif destructive_tokenization[0] == 'EEttä':
-            return ['Ett'] + destructive_tokenization[1:]
-        elif len(destructive_tokenization) > 11 and destructive_tokenization[11] == 'mittä':
-            return destructive_tokenization[:11] + ['milt'] + destructive_tokenization[12:]
+        # CoNLL evaluation to run.
+        assert 'id' not in system_sentence
+
+        texts = system_sentence['texts']
+        if texts[0] == 'Eivtta' and texts[1] == 'vät':
+            return insert_multi_word(system_sentence, 0, '1-2', 'Eivät')
+        elif texts[0] == 'EEttä' and texts[1] == 'ekö':
+            return insert_multi_word(system_sentence, 0, '1-2', 'Ettekö')
+        elif len(texts) > 11 and texts[11] == 'mittä':
+            return insert_multi_word(system_sentence, 11, '12-13', 'miltei')
         else:
-            return destructive_tokenization
+            return system_sentence
 
 
 class SpacyFiExperimental:
@@ -387,7 +392,7 @@ def __init__(self, embedding='base'):
         self.name = f'trankit-{embedding}'
         self.embedding = f'xlm-roberta-{embedding}'
         self.nlp = None
-        self.tokenizer_is_destructive = False
+        self.tokenizer_is_destructive = True
 
     def initialize(self):
         self.nlp = trankit.Pipeline('finnish',
Expand Down Expand Up @@ -416,6 +421,24 @@ def parse(self, texts):

return res

def fix_surface_forms(self, system_sentence, gold_sentence):
# This is not even trying to be general, but fixes just enough for the
# CoNLL evaluation to run.
assert 'id' not in system_sentence

text = system_sentence['texts']
if text[0] == 'Miksi' and text[1] == 'ei' and gold_sentence.tokens[0].text == 'Eikö':
fixed = insert_multi_word(system_sentence, 0, '1-2', 'Eikö')

if text[7] == 'eta' and text[8] == 'ei':
return insert_multi_word(fixed, 8 + 1, '8-9', 'ei')
else:
return fixed
elif len(text) > 6 and text[5] == 'en' and text[6] == 'mä' and gold_sentence.tokens[5].text == 'Emmä':
return insert_multi_word(system_sentence, 5, '6-7', 'Emmä')
else:
return system_sentence


class Simplemma:
def __init__(self):
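
One detail worth flagging in the nested branch above: the first insert_multi_word call shifts every following word one slot to the right, which is why the second insertion point is written as `8 + 1` rather than `8`. A minimal sketch with invented placeholder tokens (the helper is defined at the bottom of this file):

# Hypothetical: 'eta' and 'ei' sit at indices 7 and 8 of the raw output.
# Inserting the 'Eikö' surface row at index 0 moves them to 8 and 9,
# so the second surface row must be inserted at 8 + 1.
texts = ['Miksi', 'ei', 'w3', 'w4', 'w5', 'w6', 'w7', 'eta', 'ei']
fixed = insert_multi_word({'texts': texts, 'lemmas': ['_'] * 9, 'pos': ['_'] * 9},
                          0, '1-2', 'Eikö')
assert fixed['texts'][8 + 1] == 'ei'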
Expand Down Expand Up @@ -549,6 +572,24 @@ def chunks(iterable, n):
return zip_longest(*args, fillvalue=None)


def insert_multi_word(sentence, index, multi_word_id, text):
ids = [str(x) for x in range(1, len(sentence['texts']) + 1)]
ids.insert(index, multi_word_id)
texts = list(sentence['texts'])
texts.insert(index, text)
pos = list(sentence['pos'])
pos.insert(index, '_')
lemmas = list(sentence['lemmas'])
lemmas.insert(index, '_')

return {
'id': ids,
'texts': texts,
'lemmas': lemmas,
'pos': pos,
}


all_models = [
UDPipe('fi-tdt'),
Voikko(),
Expand Down
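
A minimal usage sketch of the new helper (invented three-word sentence; the expected values follow directly from the definition above):

sent = {'texts': ['En', 'mä', 'tiedä'],
        'lemmas': ['ei', 'minä', 'tietää'],
        'pos': ['AUX', 'PRON', 'VERB']}
fixed = insert_multi_word(sent, 0, '1-2', 'Emmä')
assert fixed['id'] == ['1-2', '1', '2', '3']
assert fixed['texts'] == ['Emmä', 'En', 'mä', 'tiedä']
assert fixed['lemmas'][0] == '_' and fixed['pos'][0] == '_'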
predict.py: 5 additions & 3 deletions
@@ -76,8 +76,7 @@ def predict_lemma_and_pos(model, testset, outputdir):
         # TODO: Support for arbitrary sentence splits
         assert len(predicted) == len(testset.sentences)
         for system, gold in zip(predicted, testset.sentences):
-            system_fixed = dict(system)
-            system_fixed['texts'] = model.fix_surface_forms(system['texts'], gold)
+            system_fixed = model.fix_surface_forms(system, gold)
             updated_predicted.append(system_fixed)
 
         predicted = updated_predicted
@@ -124,7 +123,10 @@ def write_results_conllu(f, predicted):
     it = zip_longest(ids, observed_words, observed_lemmas, observed_pos, fillvalue='')
     for (i, orth, lemma, pos) in it:
         nlemma = remove_compund_word_boundary_markers(lemma)
-        if i == '1':
+        if '-' in i:
+            fake_head = '_'
+            fake_rel = '_'
+        elif i == '1':
             fake_head = '0'
             fake_rel = 'root'
         else:
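
After these changes, a sentence repaired with insert_multi_word comes out of this writer roughly as below; the sentence is invented, and the '?' head/deprel values come from the else branch that is collapsed out of this diff:

# ID    FORM     LEMMA    UPOS   ...  HEAD  DEPREL
1-2     Emmä     _        _           _     _        <- '-' in id: placeholders
1       En       ei       AUX         0     root     <- id '1': fake root
2       mä       minä     PRON        ?     ?
3       tiedä    tietää   VERB        ?     ?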
