Skip to content

Commit

Permalink
More input data cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
aajanki committed May 1, 2022
1 parent 2ae04c5 commit 49287a9
Showing 1 changed file with 69 additions and 7 deletions.
76 changes: 69 additions & 7 deletions preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
num_non_verb_aux = 0
num_missing_head = 0
num_non_continuous_index = 0
num_cycles = 0
num_duplicate_roots = 0
num_invalid_pos = 0

with open(in_path, 'r', encoding='utf-8') as inf, \
open(out_path, 'w', encoding='utf-8') as outf:
Expand All @@ -82,6 +85,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
num_sentences += 1

prev_index = 0
root_index = None
sentences_has_duplicate_root = False
preprocessed_columns = []
for token_line in sentence:
num_tokens += 1
if token_line == '' or token_line.startswith('#'):
Expand All @@ -93,19 +99,36 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):
if len(cols) < 10:
num_too_few_columns += 1
# This happens a few times in FTB1/2. Most of these should
# be commas. We'll set all of them to commas with heads
# pointing to previous tokens.
# be commas. Try to guess the correct punctuation, and use
# comma if guessing fails.
#
# In few cases, the token should be something else, but
# we set them to commas anyway.
# Head is set to the previous token or next token if this
# is the first token in the sentence. This is obviously
# incorrect, but since we are not using the dependencies,
# it doesn't matter.
if len(cols) == 3 and cols[1] == '"' and cols[2] == '"':
c = '"'
else:
c = ','

if int(cols[0]) == 1:
head = 2
else:
head = int(cols[0]) - 1

cols = [
cols[0], ',', ',', 'PUNCT', 'punct', '_',
str(int(cols[0]) - 1), 'punct', '_', '_'
cols[0], c, c, 'PUNCT', 'punct', '_',
str(head), 'punct', '_', '_'
]
if len(cols) > 10:
num_too_many_columns += 1
cols = cols[:10]

if cols[6] == '0':
if root_index is not None:
sentences_has_duplicate_root = True
root_index = int(cols[0])

# Skip multiword tokens and empty nodes
if '-' in cols[0] or '.' in cols[0]:
num_multiword_tokens += 1
Expand All @@ -117,6 +140,9 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):

cols[3] = normalize_pos(cols[3], tag_map)

if is_invalid_pos_tag(cols[3]):
num_invalid_pos += 1

if aux_from_deprel and cols[7] in ['aux', 'aux:pass', 'cop']:
if cols[3] in ['VERB', 'AUX']:
cols[3] = 'AUX'
Expand All @@ -135,23 +161,41 @@ def preprocess(filename, inputdir, destdir, tag_map, aux_from_deprel):

prev_index = int(cols[0])

preprocessed_columns.append(cols)

if sentences_has_duplicate_root:
# Skip sentences with duplicate roots.
num_duplicate_roots += 1
continue

if has_cycles(preprocessed_columns):
num_cycles += 1
continue

for cols in preprocessed_columns:
outf.write('\t'.join(cols))
outf.write('\n')
outf.write('\n')

logging.info(f'Processed {num_sentences} sentences with {num_tokens} tokens')
if num_too_few_columns > 0:
logging.warning(f'Replaced {num_too_few_columns} tokens with too few columns with commas.')
logging.warning(f'Replaced {num_too_few_columns} tokens with too few columns')
if num_too_many_columns > 0:
logging.warning(f'Encountered {num_too_many_columns} tokens with too many columns. Extra columns were ignored')
if num_non_verb_aux > 0:
logging.warning(f'Leaving {num_non_verb_aux} non-verb AUX tags unchanged')
if num_invalid_pos > 0:
logging.warning(f'Leaving {num_invalid_pos} invalid POS tags unchanged')
if num_multiword_tokens > 0:
logging.info(f'Skipped {num_multiword_tokens} multiword tokens')
if num_missing_head > 0:
logging.warning(f'Fixed {num_missing_head} missing heads')
if num_non_continuous_index > 0:
logging.warning(f'Detected {num_non_continuous_index} tokens with non-continuous indices')
if num_cycles > 0:
logging.warning(f'Skipped {num_cycles} sentences with a dependency cycle')
if num_duplicate_roots > 0:
logging.warning(f'Skipped {num_duplicate_roots} sentences with duplicated roots')


def split_into_sentences(lines):
Expand Down Expand Up @@ -182,5 +226,23 @@ def normalize_pos(pos, tag_map):
return tag_map.get(pos, pos)


def is_invalid_pos_tag(pos):
    """Return True if pos is not one of the accepted UPOS tags.

    DET is used only in FTB1.
    """
    accepted_tags = {
        'ADJ', 'ADV', 'ADP', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
        'NUM', 'PROPN', 'PRON', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    }
    return pos not in accepted_tags


def has_cycles(lines):
    """Return True if the dependency heads of a sentence contain a cycle.

    lines: a list of CoNLL-U token rows (each a list of columns), where
    column 0 is the 1-based token index and column 6 is the HEAD index
    (0 means the root). Assumes token indices are continuous, i.e.
    lines[k] is the token with index k + 1 — presumably guaranteed by the
    caller's earlier index checks (TODO confirm against preprocess()).
    """
    for cols in lines:
        # Walk up the head chain from this token, remembering every node
        # visited on the way. The previous implementation only compared
        # the parent against the starting token, so a cycle that did not
        # pass through the starting token (e.g. heads 1->0, 2->3, 3->2)
        # made the while loop spin forever.
        seen = set()
        parent = int(cols[6])
        while parent != 0:
            if parent in seen:
                return True
            seen.add(parent)
            parent = int(lines[parent - 1][6])
    return False


# Script entry point: delegate to main() (defined elsewhere in this file).
if __name__ == '__main__':
    main()

0 comments on commit 49287a9

Please sign in to comment.