Skip to content

Commit 9c7abf2

Browse files
committed
Documentation and code reorg
1 parent 50058a5 commit 9c7abf2

File tree

1 file changed

+31
-21
lines changed

1 file changed

+31
-21
lines changed

src/sentence.rs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ mod fwd {
1616
use tables::sentence::SentenceCat;
1717
use core::cmp;
1818

19+
// Describes a parsed part of the source string, as defined in this table:
20+
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
1921
#[derive(Clone, Copy, PartialEq, Eq)]
2022
enum StatePart {
2123
Sot,
@@ -49,6 +51,8 @@ mod fwd {
4951
}
5052

5153
impl SentenceBreaksState {
54+
// Attempt to advance the internal state by one part
55+
// Whitespace and some punctuation will be collapsed
5256
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
5357
let &SentenceBreaksState(parts) = self;
5458
let parts = match (parts[3], cat) {
@@ -85,27 +89,28 @@ mod fwd {
8589
])
8690
}
8791

92+
// Helper function to check if state head matches a single `StatePart`
8893
fn match1(&self, part: StatePart) -> bool {
8994
let &SentenceBreaksState(parts) = self;
9095
part == parts[3]
9196
}
9297

98+
// Helper function to check if the two `StatePart`s at the state head match
99+
// the given two
93100
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
94101
let &SentenceBreaksState(parts) = self;
95102
part1 == parts[2] && part2 == parts[3]
96103
}
97104
}
98105

106+
// https://unicode.org/reports/tr29/#SB8
107+
// TODO: cache this; it is currently quadratic
99108
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
100-
let aterm_part = {
101-
// ATerm Close* Sp*
102-
let &SentenceBreaksState(parts) = state;
103-
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
104-
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
105-
parts[idx]
106-
};
109+
let &SentenceBreaksState(parts) = state;
110+
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111+
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
107112

108-
if aterm_part == StatePart::ATerm {
113+
if parts[idx] == StatePart::ATerm {
109114
use tables::sentence as se;
110115

111116
for next_char in ahead.chars() {
@@ -124,6 +129,7 @@ mod fwd {
124129
false
125130
}
126131

132+
// https://unicode.org/reports/tr29/#SB8a
127133
fn match_sb8a(state: &SentenceBreaksState) -> bool {
128134
// SATerm Close* Sp*
129135
let &SentenceBreaksState(parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132138
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
133139
}
134140

141+
// https://unicode.org/reports/tr29/#SB9
135142
fn match_sb9(state: &SentenceBreaksState) -> bool {
136143
// SATerm Close*
137144
let &SentenceBreaksState(parts) = state;
138145
let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
139146
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140147
}
141148

149+
// https://unicode.org/reports/tr29/#SB11
142150
fn match_sb11(state: &SentenceBreaksState) -> bool {
143151
// SATerm Close* Sp* ParaSep?
144152
let &SentenceBreaksState(parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180188
self.state = self.state.next(next_cat);
181189

182190
match next_cat {
183-
// SB1
191+
// SB1 https://unicode.org/reports/tr29/#SB1
184192
_ if state_before.match1(StatePart::Sot) =>
185193
return Some(position_before),
186194

187-
// SB3
195+
// SB2 is handled once the inner iterator (chars) is exhausted
196+
197+
// SB3 https://unicode.org/reports/tr29/#SB3
188198
SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
189199
continue,
190200

191-
// SB4
201+
// SB4 https://unicode.org/reports/tr29/#SB4
192202
_ if state_before.match1(StatePart::Sep)
193203
|| state_before.match1(StatePart::CR)
194204
|| state_before.match1(StatePart::LF)
195205
=> return Some(position_before),
196206

197-
// SB5
207+
// SB5 https://unicode.org/reports/tr29/#SB5
198208
SentenceCat::SC_Extend |
199209
SentenceCat::SC_Format => self.state = state_before,
200210

201-
// SB6
211+
// SB6 https://unicode.org/reports/tr29/#SB6
202212
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
203213
continue,
204214

205-
// SB7
215+
// SB7 https://unicode.org/reports/tr29/#SB7
206216
SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
207217
continue,
208218

209-
// SB8
219+
// SB8 https://unicode.org/reports/tr29/#SB8
210220
_ if match_sb8(&state_before, &self.string[position_before..]) =>
211221
continue,
212222

213-
// SB8a
223+
// SB8a https://unicode.org/reports/tr29/#SB8a
214224
SentenceCat::SC_SContinue |
215225
SentenceCat::SC_STerm |
216226
SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
217227
continue,
218228

219-
// SB9
229+
// SB9 https://unicode.org/reports/tr29/#SB9
220230
SentenceCat::SC_Close |
221231
SentenceCat::SC_Sp |
222232
SentenceCat::SC_Sep |
223233
SentenceCat::SC_CR |
224234
SentenceCat::SC_LF if match_sb9(&state_before) =>
225235
continue,
226236

227-
// SB10
237+
// SB10 https://unicode.org/reports/tr29/#SB10
228238
SentenceCat::SC_Sp |
229239
SentenceCat::SC_Sep |
230240
SentenceCat::SC_CR |
231241
SentenceCat::SC_LF if match_sb8a(&state_before) =>
232242
continue,
233243

234-
// SB11
244+
// SB11 https://unicode.org/reports/tr29/#SB11
235245
_ if match_sb11(&state_before) =>
236246
return Some(position_before),
237247

238-
// SB998
248+
// SB998 https://unicode.org/reports/tr29/#SB998
239249
_ => continue
240250
}
241251
}
242252

243-
// SB2
253+
// SB2 https://unicode.org/reports/tr29/#SB2
244254
if self.state.match1(StatePart::Sot) {
245255
None
246256
} else if self.state.match1(StatePart::Eot) {

0 commit comments

Comments
 (0)