|
9 | 9 | // except according to those terms.
|
10 | 10 |
|
11 | 11 | use core::cmp;
|
12 |
| -use core::iter::Filter; |
13 | 12 |
|
14 |
| -use tables::sentence::SentenceCat; |
| 13 | +mod fwd { |
| 14 | + use tables::sentence::SentenceCat; |
| 15 | + use core::cmp; |
15 | 16 |
|
16 |
| -/// TODO |
17 |
| -#[derive(Clone)] |
/// One slot of the sentence-break state window.
///
/// Each slot summarises a character (or run of characters) that has already
/// been consumed, or marks a text boundary.
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
    /// Start of text; every slot of `INITIAL_STATE` holds this value.
    Sot,
    /// End of text; appended by `SentenceBreaksState::end`.
    Eot,
    /// Any sentence category that has no dedicated variant below.
    Other,
    /// A character of category `SC_CR`.
    CR,
    /// A character of category `SC_LF`.
    LF,
    /// A character of category `SC_Sep`.
    Sep,
    /// A character of category `SC_ATerm`.
    ATerm,
    /// A character of category `SC_Upper` or `SC_Lower`.
    UpperLower,
    /// One or more consecutive `SC_Close` characters.
    ClosePlus,
    /// One or more consecutive `SC_Sp` characters.
    SpPlus,
    /// A character of category `SC_STerm`.
    STerm,
}
| 31 | + |
| 32 | + #[derive(Clone, PartialEq, Eq)] |
| 33 | + struct SentenceBreaksState(pub [StatePart; 4]); |
| 34 | + |
| 35 | + const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ |
| 36 | + StatePart::Sot, |
| 37 | + StatePart::Sot, |
| 38 | + StatePart::Sot, |
| 39 | + StatePart::Sot |
| 40 | + ]); |
| 41 | + |
| 42 | + pub struct SentenceBreaks<'a> { |
| 43 | + pub string: &'a str, |
| 44 | + pos: usize, |
| 45 | + state: SentenceBreaksState |
| 46 | + } |
| 47 | + |
| 48 | + impl SentenceBreaksState { |
| 49 | + fn next(&self, cat: SentenceCat) -> SentenceBreaksState { |
| 50 | + let &SentenceBreaksState(parts) = self; |
| 51 | + let parts = match (parts[3], cat) { |
| 52 | + (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, |
| 53 | + (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, |
| 54 | + _ => [ |
| 55 | + parts[1], |
| 56 | + parts[2], |
| 57 | + parts[3], |
| 58 | + match cat { |
| 59 | + SentenceCat::SC_CR => StatePart::CR, |
| 60 | + SentenceCat::SC_LF => StatePart::LF, |
| 61 | + SentenceCat::SC_Sep => StatePart::Sep, |
| 62 | + SentenceCat::SC_ATerm => StatePart::ATerm, |
| 63 | + SentenceCat::SC_Upper | |
| 64 | + SentenceCat::SC_Lower => StatePart::UpperLower, |
| 65 | + SentenceCat::SC_Close => StatePart::ClosePlus, |
| 66 | + SentenceCat::SC_Sp => StatePart::SpPlus, |
| 67 | + SentenceCat::SC_STerm => StatePart::STerm, |
| 68 | + _ => StatePart::Other |
| 69 | + } |
| 70 | + ] |
| 71 | + }; |
| 72 | + SentenceBreaksState(parts) |
| 73 | + } |
| 74 | + |
| 75 | + fn end(&self) -> SentenceBreaksState { |
| 76 | + let &SentenceBreaksState(parts) = self; |
| 77 | + SentenceBreaksState([ |
| 78 | + parts[1], |
| 79 | + parts[2], |
| 80 | + parts[3], |
| 81 | + StatePart::Eot |
| 82 | + ]) |
| 83 | + } |
| 84 | + |
| 85 | + fn match1(&self, part: StatePart) -> bool { |
| 86 | + let &SentenceBreaksState(parts) = self; |
| 87 | + part == parts[3] |
| 88 | + } |
| 89 | + |
| 90 | + fn match2(&self, part1: StatePart, part2: StatePart) -> bool { |
| 91 | + let &SentenceBreaksState(parts) = self; |
| 92 | + part1 == parts[2] && part2 == parts[3] |
| 93 | + } |
| 94 | + } |
| 95 | + |
| 96 | + fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { |
| 97 | + let aterm_part = { |
| 98 | + // ATerm Close* Sp* |
| 99 | + let &SentenceBreaksState(parts) = state; |
| 100 | + let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
| 101 | + if parts[idx] == StatePart::ClosePlus { idx -= 1 } |
| 102 | + parts[idx] |
| 103 | + }; |
| 104 | + |
| 105 | + if aterm_part == StatePart::ATerm { |
| 106 | + use tables::sentence as se; |
| 107 | + |
| 108 | + for next_char in ahead.chars() { |
| 109 | + //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower |
| 110 | + match se::sentence_category(next_char) { |
| 111 | + se::SC_Lower => return true, |
| 112 | + se::SC_OLetter | |
| 113 | + se::SC_Upper | |
| 114 | + se::SC_Sep | se::SC_CR | se::SC_LF | |
| 115 | + se::SC_STerm | se::SC_ATerm => return false, |
| 116 | + _ => continue |
| 117 | + } |
| 118 | + } |
| 119 | + } |
| 120 | + |
| 121 | + false |
| 122 | + } |
| 123 | + |
| 124 | + fn match_sb8a(state: &SentenceBreaksState) -> bool { |
| 125 | + // SATerm Close* Sp* |
| 126 | + let &SentenceBreaksState(parts) = state; |
| 127 | + let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
| 128 | + if parts[idx] == StatePart::ClosePlus { idx -= 1 } |
| 129 | + parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 130 | + } |
| 131 | + |
| 132 | + fn match_sb9(state: &SentenceBreaksState) -> bool { |
| 133 | + // SATerm Close* |
| 134 | + let &SentenceBreaksState(parts) = state; |
| 135 | + let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 }; |
| 136 | + parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 137 | + } |
| 138 | + |
| 139 | + fn match_sb11(state: &SentenceBreaksState) -> bool { |
| 140 | + // SATerm Close* Sp* ParaSep? |
| 141 | + let &SentenceBreaksState(parts) = state; |
| 142 | + let mut idx = match parts[3] { |
| 143 | + StatePart::Sep | |
| 144 | + StatePart::CR | |
| 145 | + StatePart::LF => 2, |
| 146 | + _ => 3 |
| 147 | + }; |
| 148 | + |
| 149 | + if parts[idx] == StatePart::SpPlus { idx -= 1 } |
| 150 | + if parts[idx] == StatePart::ClosePlus { idx -= 1} |
| 151 | + |
| 152 | + parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 153 | + } |
| 154 | + |
| 155 | + impl<'a> Iterator for SentenceBreaks<'a> { |
| 156 | + // Returns the index of the character which follows a break |
| 157 | + type Item = usize; |
| 158 | + |
| 159 | + #[inline] |
| 160 | + fn size_hint(&self) -> (usize, Option<usize>) { |
| 161 | + let slen = self.string.len(); |
| 162 | + // A sentence could be one character |
| 163 | + (cmp::min(slen, 2), Some(slen + 1)) |
| 164 | + } |
| 165 | + |
| 166 | + #[inline] |
| 167 | + fn next(&mut self) -> Option<usize> { |
| 168 | + use tables::sentence as se; |
| 169 | + |
| 170 | + for next_char in self.string[self.pos..].chars() { |
| 171 | + let position_before = self.pos; |
| 172 | + let state_before = self.state.clone(); |
| 173 | + |
| 174 | + let next_cat = se::sentence_category(next_char); |
| 175 | + |
| 176 | + self.pos += next_char.len_utf8(); |
| 177 | + self.state = self.state.next(next_cat); |
| 178 | + |
| 179 | + match next_cat { |
| 180 | + // SB1 |
| 181 | + _ if state_before.match1(StatePart::Sot) => |
| 182 | + return Some(position_before), |
| 183 | + |
| 184 | + // SB3 |
| 185 | + SentenceCat::SC_LF if state_before.match1(StatePart::CR) => |
| 186 | + continue, |
| 187 | + |
| 188 | + // SB4 |
| 189 | + _ if state_before.match1(StatePart::Sep) |
| 190 | + || state_before.match1(StatePart::CR) |
| 191 | + || state_before.match1(StatePart::LF) |
| 192 | + => return Some(position_before), |
| 193 | + |
| 194 | + // SB5 |
| 195 | + SentenceCat::SC_Extend | |
| 196 | + SentenceCat::SC_Format => self.state = state_before, |
| 197 | + |
| 198 | + // SB6 |
| 199 | + SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => |
| 200 | + continue, |
| 201 | + |
| 202 | + // SB7 |
| 203 | + SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => |
| 204 | + continue, |
| 205 | + |
| 206 | + // SB8 |
| 207 | + _ if match_sb8(&state_before, &self.string[position_before..]) => |
| 208 | + continue, |
| 209 | + |
| 210 | + // SB8a |
| 211 | + SentenceCat::SC_SContinue | |
| 212 | + SentenceCat::SC_STerm | |
| 213 | + SentenceCat::SC_ATerm if match_sb8a(&state_before) => |
| 214 | + continue, |
| 215 | + |
| 216 | + // SB9 |
| 217 | + SentenceCat::SC_Close | |
| 218 | + SentenceCat::SC_Sp | |
| 219 | + SentenceCat::SC_Sep | |
| 220 | + SentenceCat::SC_CR | |
| 221 | + SentenceCat::SC_LF if match_sb9(&state_before) => |
| 222 | + continue, |
| 223 | + |
| 224 | + // SB10 |
| 225 | + SentenceCat::SC_Sp | |
| 226 | + SentenceCat::SC_Sep | |
| 227 | + SentenceCat::SC_CR | |
| 228 | + SentenceCat::SC_LF if match_sb8a(&state_before) => |
| 229 | + continue, |
| 230 | + |
| 231 | + // SB11 |
| 232 | + _ if match_sb11(&state_before) => |
| 233 | + return Some(position_before), |
| 234 | + |
| 235 | + // SB998 |
| 236 | + _ => continue |
| 237 | + } |
| 238 | + } |
| 239 | + |
| 240 | + // SB2 |
| 241 | + if self.state.match1(StatePart::Sot) { |
| 242 | + None |
| 243 | + } else if self.state.match1(StatePart::Eot) { |
| 244 | + None |
| 245 | + } else { |
| 246 | + self.state = self.state.end(); |
| 247 | + Some(self.pos) |
| 248 | + } |
| 249 | + } |
| 250 | + } |
| 251 | + |
| 252 | + pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> { |
| 253 | + SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE } |
| 254 | + } |
| 255 | + |
| 256 | +} |
| 257 | + |
| 258 | +/// TODO docs |
18 | 259 | pub struct USentenceBounds<'a> {
|
19 |
| - string: &'a str |
20 |
| - // state? |
| 260 | + iter: fwd::SentenceBreaks<'a>, |
| 261 | + sentence_start: Option<usize> |
| 262 | +} |
| 263 | + |
| 264 | +/// TODO docs |
| 265 | +pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> { |
| 266 | + USentenceBounds { |
| 267 | + iter: fwd::new_sentence_breaks(source), |
| 268 | + sentence_start: None |
| 269 | + } |
21 | 270 | }
|
22 | 271 |
|
23 | 272 | impl<'a> Iterator for USentenceBounds<'a> {
|
24 | 273 | type Item = &'a str;
|
25 | 274 |
|
26 | 275 | #[inline]
|
27 | 276 | fn size_hint(&self) -> (usize, Option<usize>) {
|
28 |
| - let slen = self.string.len(); |
29 |
| - (cmp::min(slen, 1), Some(slen)) |
| 277 | + let (lower, upper) = self.iter.size_hint(); |
| 278 | + (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) |
30 | 279 | }
|
31 | 280 |
|
32 | 281 | #[inline]
|
33 | 282 | fn next(&mut self) -> Option<&'a str> {
|
34 |
| - panic!("todo") |
35 |
| - } |
36 |
| -} |
| 283 | + if self.sentence_start == None { |
| 284 | + if let Some(start_pos) = self.iter.next() { |
| 285 | + self.sentence_start = Some(start_pos) |
| 286 | + } else { |
| 287 | + return None |
| 288 | + } |
| 289 | + } |
37 | 290 |
|
38 |
| -#[inline] |
39 |
| -pub fn new_sentence_bounds<'b>(s: &'b str) -> USentenceBounds<'b> { |
40 |
| - USentenceBounds { string: s } |
| 291 | + if let Some(break_pos) = self.iter.next() { |
| 292 | + let start_pos = self.sentence_start.unwrap(); |
| 293 | + let sentence = &self.iter.string[start_pos..break_pos]; |
| 294 | + self.sentence_start = Some(break_pos); |
| 295 | + Some(sentence) |
| 296 | + } else { |
| 297 | + None |
| 298 | + } |
| 299 | + } |
41 | 300 | }
|
0 commit comments