@@ -16,6 +16,8 @@ mod fwd {
16
16
use tables:: sentence:: SentenceCat ;
17
17
use core:: cmp;
18
18
19
+ // Describes one parsed part of the source string, as defined in this table:
20
+ // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
19
21
#[ derive( Clone , Copy , PartialEq , Eq ) ]
20
22
enum StatePart {
21
23
Sot ,
@@ -49,6 +51,8 @@ mod fwd {
49
51
}
50
52
51
53
impl SentenceBreaksState {
54
+ // Attempt to advance the internal state by one part
55
+ // Whitespace and some punctuation will be collapsed
52
56
fn next ( & self , cat : SentenceCat ) -> SentenceBreaksState {
53
57
let & SentenceBreaksState ( parts) = self ;
54
58
let parts = match ( parts[ 3 ] , cat) {
@@ -85,27 +89,28 @@ mod fwd {
85
89
] )
86
90
}
87
91
92
+ // Helper function to check if state head matches a single `StatePart`
88
93
fn match1 ( & self , part : StatePart ) -> bool {
89
94
let & SentenceBreaksState ( parts) = self ;
90
95
// Only the last slot (`parts[3]`, the state head) is compared with `part`.
part == parts[ 3 ]
91
96
}
92
97
98
+ // Helper function to check if the two `StatePart`s at the head of the state
99
+ // (`parts[2]` followed by `parts[3]`) match the given two
93
100
fn match2 ( & self , part1 : StatePart , part2 : StatePart ) -> bool {
94
101
let & SentenceBreaksState ( parts) = self ;
95
102
// `part1` is compared with the slot just before the head (`parts[2]`),
// `part2` with the head itself (`parts[3]`); both must match.
part1 == parts[ 2 ] && part2 == parts[ 3 ]
96
103
}
97
104
}
98
105
106
+ // https://unicode.org/reports/tr29/#SB8
107
+ // TODO cache this, it is currently quadratic
99
108
fn match_sb8 ( state : & SentenceBreaksState , ahead : & str ) -> bool {
100
- let aterm_part = {
101
- // ATerm Close* Sp*
102
- let & SentenceBreaksState ( parts) = state;
103
- let mut idx = if parts[ 3 ] == StatePart :: SpPlus { 2 } else { 3 } ;
104
- if parts[ idx] == StatePart :: ClosePlus { idx -= 1 }
105
- parts[ idx]
106
- } ;
109
+ let & SentenceBreaksState ( parts) = state;
110
+ let mut idx = if parts[ 3 ] == StatePart :: SpPlus { 2 } else { 3 } ;
111
+ if parts[ idx] == StatePart :: ClosePlus { idx -= 1 }
107
112
108
- if aterm_part == StatePart :: ATerm {
113
+ if parts [ idx ] == StatePart :: ATerm {
109
114
use tables:: sentence as se;
110
115
111
116
for next_char in ahead. chars ( ) {
@@ -124,6 +129,7 @@ mod fwd {
124
129
false
125
130
}
126
131
132
+ // https://unicode.org/reports/tr29/#SB8a
127
133
fn match_sb8a ( state : & SentenceBreaksState ) -> bool {
128
134
// SATerm Close* Sp*
129
135
let & SentenceBreaksState ( parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132
138
parts[ idx] == StatePart :: STerm || parts[ idx] == StatePart :: ATerm
133
139
}
134
140
141
+ // https://unicode.org/reports/tr29/#SB9
135
142
fn match_sb9 ( state : & SentenceBreaksState ) -> bool {
136
143
// SATerm Close*
// Returns true when the state ends in a sentence terminator, optionally
// followed by a single `ClosePlus` part.
137
144
let & SentenceBreaksState ( parts) = state;
138
145
// If the head is a `ClosePlus` part, step back one slot so that `idx`
// points at the candidate terminator instead.
let idx = if parts[ 3 ] == StatePart :: ClosePlus { 2 } else { 3 } ;
139
146
// SATerm: the selected part must be either `STerm` or `ATerm`.
parts[ idx] == StatePart :: STerm || parts[ idx] == StatePart :: ATerm
140
147
}
141
148
149
+ // https://unicode.org/reports/tr29/#SB11
142
150
fn match_sb11 ( state : & SentenceBreaksState ) -> bool {
143
151
// SATerm Close* Sp* ParaSep?
144
152
let & SentenceBreaksState ( parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180
188
self . state = self . state . next ( next_cat) ;
181
189
182
190
match next_cat {
183
- // SB1
191
+ // SB1 https://unicode.org/reports/tr29/#SB1
184
192
_ if state_before. match1 ( StatePart :: Sot ) =>
185
193
return Some ( position_before) ,
186
194
187
- // SB3
195
+ // SB2 is handled when inner iterator (chars) is finished
196
+
197
+ // SB3 https://unicode.org/reports/tr29/#SB3
188
198
SentenceCat :: SC_LF if state_before. match1 ( StatePart :: CR ) =>
189
199
continue ,
190
200
191
- // SB4
201
+ // SB4 https://unicode.org/reports/tr29/#SB4
192
202
_ if state_before. match1 ( StatePart :: Sep )
193
203
|| state_before. match1 ( StatePart :: CR )
194
204
|| state_before. match1 ( StatePart :: LF )
195
205
=> return Some ( position_before) ,
196
206
197
- // SB5
207
+ // SB5 https://unicode.org/reports/tr29/#SB5
198
208
SentenceCat :: SC_Extend |
199
209
SentenceCat :: SC_Format => self . state = state_before,
200
210
201
- // SB6
211
+ // SB6 https://unicode.org/reports/tr29/#SB6
202
212
SentenceCat :: SC_Numeric if state_before. match1 ( StatePart :: ATerm ) =>
203
213
continue ,
204
214
205
- // SB7
215
+ // SB7 https://unicode.org/reports/tr29/#SB7
206
216
SentenceCat :: SC_Upper if state_before. match2 ( StatePart :: UpperLower , StatePart :: ATerm ) =>
207
217
continue ,
208
218
209
- // SB8
219
+ // SB8 https://unicode.org/reports/tr29/#SB8
210
220
_ if match_sb8 ( & state_before, & self . string [ position_before..] ) =>
211
221
continue ,
212
222
213
- // SB8a
223
+ // SB8a https://unicode.org/reports/tr29/#SB8a
214
224
SentenceCat :: SC_SContinue |
215
225
SentenceCat :: SC_STerm |
216
226
SentenceCat :: SC_ATerm if match_sb8a ( & state_before) =>
217
227
continue ,
218
228
219
- // SB9
229
+ // SB9 https://unicode.org/reports/tr29/#SB9
220
230
SentenceCat :: SC_Close |
221
231
SentenceCat :: SC_Sp |
222
232
SentenceCat :: SC_Sep |
223
233
SentenceCat :: SC_CR |
224
234
SentenceCat :: SC_LF if match_sb9 ( & state_before) =>
225
235
continue ,
226
236
227
- // SB10
237
+ // SB10 https://unicode.org/reports/tr29/#SB10
228
238
SentenceCat :: SC_Sp |
229
239
SentenceCat :: SC_Sep |
230
240
SentenceCat :: SC_CR |
231
241
SentenceCat :: SC_LF if match_sb8a ( & state_before) =>
232
242
continue ,
233
243
234
- // SB11
244
+ // SB11 https://unicode.org/reports/tr29/#SB11
235
245
_ if match_sb11 ( & state_before) =>
236
246
return Some ( position_before) ,
237
247
238
- // SB998
248
+ // SB998 https://unicode.org/reports/tr29/#SB998
239
249
_ => continue
240
250
}
241
251
}
242
252
243
- // SB2
253
+ // SB2 https://unicode.org/reports/tr29/#SB2
244
254
if self . state . match1 ( StatePart :: Sot ) {
245
255
None
246
256
} else if self . state . match1 ( StatePart :: Eot ) {
0 commit comments