Skip to content

Commit

Permalink
More input data cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
aajanki committed May 1, 2022
1 parent 2ae04c5 commit 49287a9
Showing 1 changed file with 69 additions and 7 deletions.
76 changes: 69 additions & 7 deletions preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
num_non_verb_aux = 0
num_missing_head = 0
num_non_continuous_index = 0
num_cycles = 0
num_duplicate_roots = 0
num_invalid_pos = 0

with open(in_path, 'r', encoding='utf-8') as inf, \
open(out_path, 'w', encoding='utf-8') as outf:
Expand All @@ -82,6 +85,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
num_sentences += 1

prev_index = 0
root_index = None
sentences_has_duplicate_root = False
preprocessed_columns = []
for token_line in sentence:
num_tokens += 1
if token_line == '' or token_line.startswith('#'):
Expand All @@ -93,19 +99,36 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
if len(cols) < 10:
num_too_few_columns += 1
# This happens a few times in FTB1/2. Most of these should
# be commas. We'll set all of them to commas with heads
# pointing to previous tokens.
# be commas. Try to guess the correct punctuation, and use
# comma if guessing fails.
#
# In few cases, the token should be something else, but
# we set them to commas anyway.
# Head is set to the previous token or next token if this
# is the first token in the sentence. This is obviously
# incorrect, but since we are not using the dependencies,
# it doesn't matter.
if len(cols) == 3 and cols[1] == '"' and cols[2] == '"':
c = '"'
else:
c = ','

if int(cols[0]) == 1:
head = 2
else:
head = int(cols[0]) - 1

cols = [
cols[0], ',', ',', 'PUNCT', 'punct', '_',
str(int(cols[0]) - 1), 'punct', '_', '_'
cols[0], c, c, 'PUNCT', 'punct', '_',
str(head), 'punct', '_', '_'
]
if len(cols) > 10:
num_too_many_columns += 1
cols = cols[:10]

if cols[6] == '0':
if root_index is not None:
sentences_has_duplicate_root = True
root_index = int(cols[0])

# Skip multiword tokens and empty nodes
if '-' in cols[0] or '.' in cols[0]:
num_multiword_tokens += 1
Expand All @@ -117,6 +140,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):

cols[3] = normalize_pos(cols[3], tag_map)

if is_invalid_pos_tag(cols[3]):
num_invalid_pos += 1

if aux_from_deprel and cols[7] in ['aux', 'aux:pass', 'cop']:
if cols[3] in ['VERB', 'AUX']:
cols[3] = 'AUX'
Expand All @@ -135,23 +161,41 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):

prev_index = int(cols[0])

preprocessed_columns.append(cols)

if sentences_has_duplicate_root:
# Skip sentences with duplicate roots.
num_duplicate_roots += 1
continue

if has_cycles(preprocessed_columns):
num_cycles += 1
continue

for cols in preprocessed_columns:
outf.write('\t'.join(cols))
outf.write('\n')
outf.write('\n')

logging.info(f'Processed {num_sentences} sentences with {num_tokens} tokens')
if num_too_few_columns > 0:
logging.warning(f'Replaced {num_too_few_columns} tokens with too few columns with commas.')
logging.warning(f'Replaced {num_too_few_columns} tokens with too few columns')
if num_too_many_columns > 0:
logging.warning(f'Encountered {num_too_many_columns} tokens with too many columns. Extra columns were ignored')
if num_non_verb_aux > 0:
logging.warning(f'Leaving {num_non_verb_aux} non-verb AUX tags unchanged')
if num_invalid_pos > 0:
logging.warning(f'Leaving {num_invalid_pos} invalid POS tags unchanged')
if num_multiword_tokens > 0:
logging.info(f'Skipped {num_multiword_tokens} multiword tokens')
if num_missing_head > 0:
logging.warning(f'Fixed {num_missing_head} missing heads')
if num_non_continuous_index > 0:
logging.warning(f'Detected {num_non_continuous_index} tokens with non-continuous indices')
if num_cycles > 0:
logging.warning(f'Skipped {num_cycles} sentences with a dependency cycle')
if num_duplicate_roots > 0:
logging.warning(f'Skipped {num_duplicate_roots} sentences with duplicated roots')


def split_into_sentences(lines):
Expand Down Expand Up @@ -182,5 +226,23 @@ def normalize_pos(pos, tag_map):
return tag_map.get(pos, pos)


def is_invalid_pos_tag(pos):
    """Return True if pos is not one of the accepted UPOS tags.

    DET is used only in FTB1.
    """
    accepted_tags = {
        'ADJ', 'ADV', 'ADP', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
        'NUM', 'PROPN', 'PRON', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    }
    return pos not in accepted_tags


def has_cycles(lines):
    """Return True if the dependency heads of a sentence contain a cycle.

    lines: a list of CoNLL-U token rows (each a list of columns), where
    column 0 is the 1-based token index and column 6 is the HEAD index
    (0 means the root). Assumes token indices are continuous, i.e.
    lines[k] is the token with index k + 1 — presumably guaranteed by the
    caller's earlier index checks (TODO confirm against preprocess()).
    """
    for cols in lines:
        # Walk up the head chain from this token, remembering every node
        # visited on the way. The previous implementation only compared
        # the parent against the starting token, so a cycle that did not
        # pass through the starting token (e.g. heads 1->0, 2->3, 3->2)
        # made the while loop spin forever.
        seen = set()
        parent = int(cols[6])
        while parent != 0:
            if parent in seen:
                return True
            seen.add(parent)
            parent = int(lines[parent - 1][6])
    return False


# Script entry point: delegate to main() (defined elsewhere in this file).
if __name__ == '__main__':
    main()

0 comments on commit 49287a9

Please sign in to comment.