Skip to content

Commit

Permalink
Fix tokenizer bug when only quotes precede a colon
Browse files Browse the repository at this point in the history
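This fixes a bug where ChemWordTokenizer raised an IndexError when it encountered a quote, followed by a colon, followed by a digit. Stripping the quotes from the text before the colon left an empty string, so indexing it with [-1] failed; slicing with [-1:] yields an empty string instead, for which isdigit() is simply False.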
  • Loading branch information
mcs07 committed Jan 11, 2017
1 parent 395681f commit daeaaa4
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 1 deletion.
2 changes: 1 addition & 1 deletion chemdataextractor/nlp/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,7 @@ def _subspan(self, s, span, nextspan):
after = text[i+1:]
if char in {':', ';'}:
# Split around colon unless it looks like we're in a chemical name
if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
return self._split_span(span, i, 1)
elif char in {'x', '+', '−'}:
# Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers
Expand Down
4 changes: 4 additions & 0 deletions tests/test_nlp_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,10 @@ def test_bracket_whitespace_error(self):
self.assertEqual(['7.95', '(', 's', ',', '4H', ')'], self.t.tokenize('7.95(s, 4H)'))
self.assertEqual(['In', 'Fig.', '5', '(', 'a', ',', 'b', ')'], self.t.tokenize('In Fig. 5(a, b)'))

def test_quote_colon(self):
"""Regression test: a quote, then a colon, then a digit must tokenize without raising IndexError."""
self.assertEqual(['\'', ':', '1'], self.t.tokenize('\':1'))

def test_chemtext_sentence(self):
"""Test tokenization through the Text and Sentence API."""
t = Text('Hi, my name is Matt. What is your name?')
Expand Down

0 comments on commit daeaaa4

Please sign in to comment.