Skip to content

Commit

Permalink
Remove compound word boundary markers from the gold lemmas
Browse files Browse the repository at this point in the history
  • Loading branch information
aajanki committed May 2, 2022
1 parent 2b77c19 commit be59bee
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 1 addition & 1 deletion predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def predict_lemma_and_pos(model, testset, outputdir):


def remove_compund_word_boundary_markers(word):
    """Return *word* with compound word boundary markers removed.

    A ``#`` is treated as a boundary marker, and dropped, only when it is
    directly preceded by a word character, a hyphen (``-``) or an en dash
    (``–``) and directly followed by a word character. Any other ``#``
    (leading, trailing, or standing alone) is left untouched.
    """
    # The lookbehind accepts '-' and '–' in addition to \w so that markers
    # following a hyphen, as in "linja-#auto", are stripped as well.
    return re.sub(r'(?<=[-–\w])#(?=\w)', '', word)


def write_results_conllu(f, predicted):
Expand Down
6 changes: 6 additions & 0 deletions preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import os
import os.path
import re


def main():
Expand Down Expand Up @@ -138,6 +139,7 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
num_non_continuous_index += 1
cols[0] = str(prev_index + 1)

cols[2] = remove_compound_word_boundaries(cols[2])
cols[3] = normalize_pos(cols[3], tag_map)

if is_invalid_pos_tag(cols[3]):
Expand Down Expand Up @@ -218,6 +220,10 @@ def split_into_sentences(lines):
yield sentence


# A '#' that sits between a word character, hyphen or en dash on the left
# and a word character on the right. Compiled once at import time because
# the helper below is applied to every token row.
_COMPOUND_BOUNDARY_RE = re.compile(r'(?<=[-–\w])#(?=\w)')


def remove_compound_word_boundaries(lemma):
    """Return *lemma* with compound word boundary markers removed.

    A ``#`` is dropped only when it is directly preceded by a word
    character, a hyphen (``-``) or an en dash (``–``) and directly followed
    by a word character. A ``#`` in any other position (leading, trailing,
    or a bare ``#`` lemma) is preserved.
    """
    return _COMPOUND_BOUNDARY_RE.sub('', lemma)


def normalize_pos(pos, tag_map):
if '|' in pos:
# FTB2 has some erroneous tags like "N|Sg|Ine"
Expand Down

0 comments on commit be59bee

Please sign in to comment.