scrapinghub · kmike · Sep 8, 2017 · Sep 8, 2017 · Sep 8, 2017 · Sep 8, 2017
diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py
@@ -110,6 +110,14 @@ class DefaultTokenizer(WordTokenizer):
     def tokenize(self, text):
         tokens = super(DefaultTokenizer, self).tokenize(text)
         # remove standalone commas and semicolons
+        # because they broke tag sequences, e.g. PERSON->FUNCTION for the
+        # text "PERSON, FUNCTION".
+        #
+        # Removing them has a downside too: with the punctuation gone, e.g.
+        #   expected:  [PER-B, PER-I, FUNC-B]
+        #   predicted: [PER-B, PER-I, PER-I ]
+        #
+        # FIXME: drop them as tokens, but keep as feature left/right_punct:","
         return [t for t in tokens if t not in {',', ';'}]