Skip to content

Commit 14dbeb8

Browse files
committed
tests pass
1 parent 1c3e5af commit 14dbeb8

File tree

1 file changed

+273
-14
lines changed

1 file changed

+273
-14
lines changed

src/sentence.rs

Lines changed: 273 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,292 @@
99
// except according to those terms.
1010

1111
use core::cmp;
12-
use core::iter::Filter;
1312

14-
use tables::sentence::SentenceCat;
13+
mod fwd {
14+
use tables::sentence::SentenceCat;
15+
use core::cmp;
1516

16-
/// TODO
17-
#[derive(Clone)]
17+
/// One slot of the sliding window that summarises the text scanned so far.
///
/// Most variants mirror a sentence-break category. `Sot` and `Eot` are
/// sentinels, and `ClosePlus` / `SpPlus` each stand for a whole run of
/// consecutive `Close` / `Sp` characters.
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
    /// Start of text; fills the window before anything has been read.
    Sot,
    /// End of text; pushed after the final break has been reported.
    Eot,
    /// Any category that has no dedicated slot of its own.
    Other,
    /// Carriage return.
    CR,
    /// Line feed.
    LF,
    /// `Sep` category character.
    Sep,
    /// `ATerm` category character.
    ATerm,
    /// Either an `Upper` or a `Lower` character.
    UpperLower,
    /// One or more consecutive `Close` characters.
    ClosePlus,
    /// One or more consecutive `Sp` characters.
    SpPlus,
    /// `STerm` category character.
    STerm,
}
31+
32+
#[derive(Clone, PartialEq, Eq)]
33+
struct SentenceBreaksState(pub [StatePart; 4]);
34+
35+
/// Window contents before any character has been consumed: every slot holds
/// the start-of-text sentinel.
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([StatePart::Sot; 4]);
41+
42+
pub struct SentenceBreaks<'a> {
43+
pub string: &'a str,
44+
pos: usize,
45+
state: SentenceBreaksState
46+
}
47+
48+
impl SentenceBreaksState {
49+
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
50+
let &SentenceBreaksState(parts) = self;
51+
let parts = match (parts[3], cat) {
52+
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
53+
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
54+
_ => [
55+
parts[1],
56+
parts[2],
57+
parts[3],
58+
match cat {
59+
SentenceCat::SC_CR => StatePart::CR,
60+
SentenceCat::SC_LF => StatePart::LF,
61+
SentenceCat::SC_Sep => StatePart::Sep,
62+
SentenceCat::SC_ATerm => StatePart::ATerm,
63+
SentenceCat::SC_Upper |
64+
SentenceCat::SC_Lower => StatePart::UpperLower,
65+
SentenceCat::SC_Close => StatePart::ClosePlus,
66+
SentenceCat::SC_Sp => StatePart::SpPlus,
67+
SentenceCat::SC_STerm => StatePart::STerm,
68+
_ => StatePart::Other
69+
}
70+
]
71+
};
72+
SentenceBreaksState(parts)
73+
}
74+
75+
fn end(&self) -> SentenceBreaksState {
76+
let &SentenceBreaksState(parts) = self;
77+
SentenceBreaksState([
78+
parts[1],
79+
parts[2],
80+
parts[3],
81+
StatePart::Eot
82+
])
83+
}
84+
85+
fn match1(&self, part: StatePart) -> bool {
86+
let &SentenceBreaksState(parts) = self;
87+
part == parts[3]
88+
}
89+
90+
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
91+
let &SentenceBreaksState(parts) = self;
92+
part1 == parts[2] && part2 == parts[3]
93+
}
94+
}
95+
96+
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
97+
let aterm_part = {
98+
// ATerm Close* Sp*
99+
let &SentenceBreaksState(parts) = state;
100+
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
101+
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
102+
parts[idx]
103+
};
104+
105+
if aterm_part == StatePart::ATerm {
106+
use tables::sentence as se;
107+
108+
for next_char in ahead.chars() {
109+
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
110+
match se::sentence_category(next_char) {
111+
se::SC_Lower => return true,
112+
se::SC_OLetter |
113+
se::SC_Upper |
114+
se::SC_Sep | se::SC_CR | se::SC_LF |
115+
se::SC_STerm | se::SC_ATerm => return false,
116+
_ => continue
117+
}
118+
}
119+
}
120+
121+
false
122+
}
123+
124+
/// Reports whether the window ends in `SATerm Close* Sp*`, i.e. an `STerm`
/// or `ATerm` optionally followed by a `Close` run and then an `Sp` run.
fn match_sb8a(state: &SentenceBreaksState) -> bool {
    let window = &state.0;

    // Step backwards over an optional Sp run, then an optional Close run.
    let mut cursor = 3;
    if window[cursor] == StatePart::SpPlus {
        cursor -= 1;
    }
    if window[cursor] == StatePart::ClosePlus {
        cursor -= 1;
    }

    match window[cursor] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false,
    }
}
131+
132+
/// Reports whether the window ends in `SATerm Close*`, i.e. an `STerm` or
/// `ATerm` optionally followed by a `Close` run.
fn match_sb9(state: &SentenceBreaksState) -> bool {
    let window = &state.0;

    // Look past a trailing Close run, if there is one.
    let terminator = match window[3] {
        StatePart::ClosePlus => window[2],
        newest => newest,
    };

    match terminator {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false,
    }
}
138+
139+
fn match_sb11(state: &SentenceBreaksState) -> bool {
140+
// SATerm Close* Sp* ParaSep?
141+
let &SentenceBreaksState(parts) = state;
142+
let mut idx = match parts[3] {
143+
StatePart::Sep |
144+
StatePart::CR |
145+
StatePart::LF => 2,
146+
_ => 3
147+
};
148+
149+
if parts[idx] == StatePart::SpPlus { idx -= 1 }
150+
if parts[idx] == StatePart::ClosePlus { idx -= 1}
151+
152+
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
153+
}
154+
155+
impl<'a> Iterator for SentenceBreaks<'a> {
156+
// Returns the index of the character which follows a break
157+
type Item = usize;
158+
159+
#[inline]
160+
fn size_hint(&self) -> (usize, Option<usize>) {
161+
let slen = self.string.len();
162+
// A sentence could be one character
163+
(cmp::min(slen, 2), Some(slen + 1))
164+
}
165+
166+
#[inline]
167+
fn next(&mut self) -> Option<usize> {
168+
use tables::sentence as se;
169+
170+
for next_char in self.string[self.pos..].chars() {
171+
let position_before = self.pos;
172+
let state_before = self.state.clone();
173+
174+
let next_cat = se::sentence_category(next_char);
175+
176+
self.pos += next_char.len_utf8();
177+
self.state = self.state.next(next_cat);
178+
179+
match next_cat {
180+
// SB1
181+
_ if state_before.match1(StatePart::Sot) =>
182+
return Some(position_before),
183+
184+
// SB3
185+
SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
186+
continue,
187+
188+
// SB4
189+
_ if state_before.match1(StatePart::Sep)
190+
|| state_before.match1(StatePart::CR)
191+
|| state_before.match1(StatePart::LF)
192+
=> return Some(position_before),
193+
194+
// SB5
195+
SentenceCat::SC_Extend |
196+
SentenceCat::SC_Format => self.state = state_before,
197+
198+
// SB6
199+
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
200+
continue,
201+
202+
// SB7
203+
SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
204+
continue,
205+
206+
// SB8
207+
_ if match_sb8(&state_before, &self.string[position_before..]) =>
208+
continue,
209+
210+
// SB8a
211+
SentenceCat::SC_SContinue |
212+
SentenceCat::SC_STerm |
213+
SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
214+
continue,
215+
216+
// SB9
217+
SentenceCat::SC_Close |
218+
SentenceCat::SC_Sp |
219+
SentenceCat::SC_Sep |
220+
SentenceCat::SC_CR |
221+
SentenceCat::SC_LF if match_sb9(&state_before) =>
222+
continue,
223+
224+
// SB10
225+
SentenceCat::SC_Sp |
226+
SentenceCat::SC_Sep |
227+
SentenceCat::SC_CR |
228+
SentenceCat::SC_LF if match_sb8a(&state_before) =>
229+
continue,
230+
231+
// SB11
232+
_ if match_sb11(&state_before) =>
233+
return Some(position_before),
234+
235+
// SB998
236+
_ => continue
237+
}
238+
}
239+
240+
// SB2
241+
if self.state.match1(StatePart::Sot) {
242+
None
243+
} else if self.state.match1(StatePart::Eot) {
244+
None
245+
} else {
246+
self.state = self.state.end();
247+
Some(self.pos)
248+
}
249+
}
250+
}
251+
252+
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
253+
SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
254+
}
255+
256+
}
257+
258+
/// TODO docs
18259
pub struct USentenceBounds<'a> {
19-
string: &'a str
20-
// state?
260+
iter: fwd::SentenceBreaks<'a>,
261+
sentence_start: Option<usize>
262+
}
263+
264+
/// TODO docs
265+
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
266+
USentenceBounds {
267+
iter: fwd::new_sentence_breaks(source),
268+
sentence_start: None
269+
}
21270
}
22271

23272
impl<'a> Iterator for USentenceBounds<'a> {
24273
type Item = &'a str;
25274

26275
#[inline]
27276
fn size_hint(&self) -> (usize, Option<usize>) {
28-
let slen = self.string.len();
29-
(cmp::min(slen, 1), Some(slen))
277+
let (lower, upper) = self.iter.size_hint();
278+
(cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
30279
}
31280

32281
#[inline]
33282
fn next(&mut self) -> Option<&'a str> {
34-
panic!("todo")
35-
}
36-
}
283+
if self.sentence_start == None {
284+
if let Some(start_pos) = self.iter.next() {
285+
self.sentence_start = Some(start_pos)
286+
} else {
287+
return None
288+
}
289+
}
37290

38-
#[inline]
39-
pub fn new_sentence_bounds<'b>(s: &'b str) -> USentenceBounds<'b> {
40-
USentenceBounds { string: s }
291+
if let Some(break_pos) = self.iter.next() {
292+
let start_pos = self.sentence_start.unwrap();
293+
let sentence = &self.iter.string[start_pos..break_pos];
294+
self.sentence_start = Some(break_pos);
295+
Some(sentence)
296+
} else {
297+
None
298+
}
299+
}
41300
}

0 commit comments

Comments
 (0)