@@ -45,7 +45,8 @@ impl<'a, P> PeriodContextTokenizer<'a, P> where P: DefinesNonWordCharacters + De
45
45
let mut pos = self . pos ;
46
46
47
47
while pos < self . doc . len ( ) {
48
- let cur = self . doc . char_at ( pos) ;
48
+ let mut iter = self . doc [ pos..] . chars ( ) ;
49
+ let cur = iter. nth ( 0 ) . unwrap ( ) ;
49
50
50
51
match cur {
51
52
// A whitespace is reached before a sentence ending character
@@ -55,7 +56,7 @@ impl<'a, P> PeriodContextTokenizer<'a, P> where P: DefinesNonWordCharacters + De
55
56
// of a new token (if there is a space after it, or if the next
56
57
// character is punctuation).
57
58
c if P :: is_sentence_ending ( & c) => {
58
- let nxt = self . doc . char_at ( pos + cur . len_utf8 ( ) ) ;
59
+ let nxt = iter . next ( ) . unwrap ( ) ;
59
60
60
61
if nxt. is_whitespace ( ) || P :: is_nonword_char ( & nxt) {
61
62
break ;
@@ -86,7 +87,7 @@ impl<'a, P> Iterator for PeriodContextTokenizer<'a, P>
86
87
let mut state: u8 = 0 ;
87
88
88
89
while self . pos < self . doc . len ( ) {
89
- let cur = self . doc . char_at ( self . pos ) ;
90
+ let cur = self . doc [ self . pos .. ] . chars ( ) . next ( ) . unwrap ( ) ;
90
91
91
92
macro_rules! return_token(
92
93
( ) => (
@@ -240,7 +241,7 @@ impl<'a, P> Iterator for WordTokenizer<'a, P>
240
241
) ;
241
242
242
243
while self . pos < self . doc . len ( ) {
243
- let cur = self . doc . char_at ( self . pos ) ;
244
+ let cur = self . doc [ self . pos .. ] . chars ( ) . next ( ) . unwrap ( ) ;
244
245
245
246
// Periods or dashes are the start of multi-chars. A multi-char
246
247
// is defined as an ellipsis or hyphen (multiple-dashes). If there
@@ -496,7 +497,7 @@ fn orthographic_heuristic<P>(tok: &Token, data: &TrainingData) -> Option<bool>
496
497
{
497
498
use prelude:: { ORT_LC , MID_UC , ORT_UC , BEG_LC } ;
498
499
499
- if P :: is_punctuation ( & tok. tok ( ) . char_at ( 0 ) ) {
500
+ if P :: is_punctuation ( & tok. tok ( ) . chars ( ) . nth ( 0 ) . unwrap ( ) ) {
500
501
Some ( false )
501
502
} else {
502
503
let ctxt = data. get_orthographic_context ( tok. typ_without_break_or_period ( ) ) ;
0 commit comments