forked from FrankKwok/Oreo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIcuIteratorWrapper.java
404 lines (380 loc) · 16.2 KB
/
IcuIteratorWrapper.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
package java.text;
/**
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
*
* <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
* and <i>regular expressions.</i></p>
*
* <p>A substitution rule defines a name that can be used in place of an expression. It
* consists of a name, which is a string of characters contained in angle brackets, an equals
* sign, and an expression. (There can be no whitespace on either side of the equals sign.)
* To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
* square brackets. A substitution is visible after its definition, and is filled in using
* simple textual substitution. Substitution definitions can contain other substitutions, as
* long as those substitutions have been defined first. Substitutions are generally used to
* make the regular expressions (which can get quite complex) shorter and easier to read.
* They typically define either character categories or commonly-used subexpressions.</p>
*
* <p>There is one special substitution. If the description defines a substitution
* called "&lt;ignore&gt;", the expression must be a [] expression, and the
* expression defines a set of characters (the "<em>ignore characters</em>") that
* will be transparent to the BreakIterator. A sequence of characters will break the
* same way it would if any ignore characters it contains are taken out. Break
* positions never occur before ignore characters.</p>
*
* <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
* defines a sequence of characters to be kept together. With one significant exception, the
* iterator uses a longest-possible-match algorithm when matching text to regular
* expressions. The iterator also treats descriptions containing multiple regular expressions
* as if they were ORed together (i.e., as if they were separated by |).</p>
*
* <p>The special characters recognized by the regular-expression parser are as follows:</p>
*
* <blockquote>
* <table border="1" width="100%">
* <tr>
* <td width="6%">*</td>
* <td width="94%">Specifies that the expression preceding the asterisk may occur any number
* of times (including not at all).</td>
* </tr>
* <tr>
* <td width="6%">{}</td>
* <td width="94%">Encloses a sequence of characters that is optional.</td>
* </tr>
* <tr>
* <td width="6%">()</td>
* <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
* repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
* the ends of expressions containing |.</td>
* </tr>
* <tr>
* <td width="6%">|</td>
* <td width="94%">Separates two alternative sequences of characters. Either one
* sequence or the other, but not both, matches this expression. The | character can
* only occur inside ().</td>
* </tr>
* <tr>
* <td width="6%">.</td>
* <td width="94%">Matches any character.</td>
* </tr>
* <tr>
* <td width="6%">*?</td>
* <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
* when there is overlap between the last group of characters in the expression preceding the
* * and the first group of characters following the *. When there is this kind of
* overlap, * will match the longest sequence of characters that match the expression before
* the *, and *? will match the shortest sequence of characters matching the expression
* before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
* "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy"),
* but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">Specifies a group of alternative characters. A [] expression will
* match any single character that is specified in the [] expression. For more on the
* syntax of [] expressions, see below.</td>
* </tr>
* <tr>
* <td width="6%">/</td>
* <td width="94%">Specifies where the break position should go if text matches this
* expression. (e.g., "[a-z]&#42;/[:Zs:]*[0-9]" will match if the iterator sees a run
* of letters, followed by a run of whitespace, followed by a digit, but the break position
* will actually go before the whitespace). Expressions that don't contain / put the
* break position at the end of the matching text.</td>
* </tr>
* <tr>
* <td width="6%">\</td>
* <td width="94%">Escape character. The \ itself is ignored, but causes the next
* character to be treated as literal character. This has no effect for many
* characters, but for the characters listed above, this deprives them of their special
* meaning. (There are no special escape sequences for Unicode characters, or tabs and
* newlines; these are all handled by a higher-level protocol. In a Java string,
* "\n" will be converted to a literal newline character by the time the
* regular-expression parser sees it. Of course, this means that \ sequences that are
* visible to the regexp parser must be written as \\ when inside a Java string.) All
* characters in the ASCII range except for letters, digits, and control characters are
* reserved characters to the parser and must be preceded by \ even if they currently don't
* mean anything.</td>
* </tr>
* <tr>
* <td width="6%">!</td>
* <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
* parser that this expression specifies the backwards-iteration behavior of the iterator,
* and not its normal iteration behavior. This is generally only used in situations
* where the automatically-generated backwards-iteration behavior doesn't produce
* satisfactory results and must be supplemented with extra client-specified rules.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
* <td width="94%">All other characters are treated as literal characters, which must match
* the corresponding character(s) in the text exactly.</td>
* </tr>
* </table>
* </blockquote>
*
* <p>Within a [] expression, a number of other special characters can be used to specify
* groups of characters:</p>
*
* <blockquote>
* <table border="1" width="100%">
* <tr>
* <td width="6%">-</td>
* <td width="94%">Specifies a range of matching characters. For example
* "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
* sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
* language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
* it include accented letters such as a-umlaut.</td>
* </tr>
* <tr>
* <td width="6%">::</td>
* <td width="94%">A pair of colons containing a one- or two-letter code matches all
* characters in the corresponding Unicode category. The two-letter codes are the same
* as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
* matches all currency symbols and all math symbols). Specifying a one-letter code is
* the same as specifying all two-letter codes that begin with that letter (for example,
* "[:L:]" matches all letters, and is equivalent to
* "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
* category code or a single letter that begins a Unicode category code is illegal within
* colons.</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">[] expressions can nest. This has no effect, except when used in
* conjunction with the ^ token.</td>
* </tr>
* <tr>
* <td width="6%">^</td>
* <td width="94%">Excludes the character (or the characters in the [] expression) following
* it from the group of characters. For example, "[a-z^p]" matches all Latin
* lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
* except the Han ideographs.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
* <td width="94%">All other characters are treated as literal characters. (For
* example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
* </tr>
* </table>
* </blockquote>
*
* <p>For a more complete explanation, see <a
* href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
* For examples, see the resource data (which is annotated).</p>
*
* @author Richard Gillam
*/
class IcuIteratorWrapper extends BreakIterator {

    /* The wrapped ICU implementation. Non-final for #clone() */
    private android.icu.text.BreakIterator wrapped;

    /**
     * Constructs an IcuIteratorWrapper that forwards every operation to the
     * given ICU break iterator.
     *
     * @param iterator The ICU BreakIterator to wrap. The reference is stored
     *                 as-is; no copy is made.
     */
    IcuIteratorWrapper(android.icu.text.BreakIterator iterator) {
        wrapped = iterator;
    }

    /**
     * Clones this iterator. The wrapped ICU iterator is cloned as well, so the
     * copy does not share its delegate with this instance.
     *
     * @return A newly-constructed IcuIteratorWrapper with the same
     * behavior as this one.
     */
    @Override
    public Object clone() {
        IcuIteratorWrapper result = (IcuIteratorWrapper) super.clone();
        // Replace the shallow-copied reference with an independent clone.
        result.wrapped = (android.icu.text.BreakIterator) wrapped.clone();
        return result;
    }

    /**
     * Returns true if {@code that} is also an IcuIteratorWrapper and the two
     * wrapped ICU iterators are equal according to their own equals().
     *
     * @param that The object to compare with; may be null.
     * @return True if the wrapped iterators are equal, false otherwise
     *         (including when {@code that} is null or of another type).
     */
    @Override
    public boolean equals(Object that) {
        if (!(that instanceof IcuIteratorWrapper)) {
            return false;
        }
        return wrapped.equals(((IcuIteratorWrapper) that).wrapped);
    }

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Returns the string representation of the wrapped ICU iterator.
     *
     * @return The result of the wrapped iterator's toString().
     */
    @Override
    public String toString() {
        return wrapped.toString();
    }

    /**
     * Compute a hashcode for this BreakIterator. The value is the wrapped
     * iterator's hash code, which keeps it consistent with equals().
     *
     * @return A hash code
     */
    @Override
    public int hashCode() {
        return wrapped.hashCode();
    }

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     *
     * @return The offset of the beginning of the text.
     */
    @Override
    public int first() {
        return wrapped.first();
    }

    /**
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     *
     * @return The text's past-the-end offset.
     */
    @Override
    public int last() {
        return wrapped.last();
    }

    /**
     * Advances the iterator either forward or backward the specified number of steps.
     * Negative values move backward, and positive values move forward.  This is
     * equivalent to repeatedly calling next() or previous().
     *
     * @param n The number of steps to move.  The sign indicates the direction
     *          (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     *         the current one.
     */
    @Override
    public int next(int n) {
        return wrapped.next(n);
    }

    /**
     * Advances the iterator to the next boundary position.
     *
     * @return The position of the first boundary after this one.
     */
    @Override
    public int next() {
        return wrapped.next();
    }

    /**
     * Advances the iterator backwards, to the last boundary preceding this one.
     *
     * @return The position of the last boundary position preceding this one.
     */
    @Override
    public int previous() {
        return wrapped.previous();
    }

    /**
     * Throws IllegalArgumentException unless begin <= offset <= end, where
     * begin and end are the begin and end indices of {@code text}. Note that
     * the end index itself (the past-the-end offset) is accepted.
     *
     * @param offset The offset to validate.
     * @param text   The text whose index range the offset is checked against.
     * @throws IllegalArgumentException if the offset lies outside that range.
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     *
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     * @throws IllegalArgumentException if the offset is outside the range of
     *         the text being analyzed.
     */
    @Override
    public int following(int offset) {
        // Validate here so an out-of-range offset is reported as an
        // IllegalArgumentException before the wrapped iterator sees it.
        CharacterIterator text = getText();
        checkOffset(offset, text);
        return wrapped.following(offset);
    }

    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     *
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @throws IllegalArgumentException if the offset is outside the range of
     *         the text being analyzed.
     */
    @Override
    public int preceding(int offset) {
        // Range-check the offset ourselves, then let the wrapped iterator
        // perform the actual search.
        CharacterIterator text = getText();
        checkOffset(offset, text);
        return wrapped.preceding(offset);
    }

    /**
     * Returns true if the specified position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     *
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @throws IllegalArgumentException if the offset is outside the range of
     *         the text being analyzed.
     */
    @Override
    public boolean isBoundary(int offset) {
        CharacterIterator text = getText();
        checkOffset(offset, text);
        return wrapped.isBoundary(offset);
    }

    /**
     * Returns the current iteration position.
     *
     * @return The current iteration position.
     */
    @Override
    public int current() {
        return wrapped.current();
    }

    /**
     * Return a CharacterIterator over the text being analyzed.  This version
     * of this method returns whatever the wrapped iterator's getText() returns
     * without copying it, so it may be the iterator used internally.
     * Changing the state of this iterator can have undefined consequences.  If
     * you need to change it, clone it first.
     *
     * @return An iterator over the text being analyzed.
     */
    @Override
    public CharacterIterator getText() {
        return wrapped.getText();
    }

    /**
     * Set the iterator to analyze a new piece of text, handing the string
     * directly to the wrapped iterator.
     *
     * @param newText The text to analyze.
     */
    @Override
    public void setText(String newText) {
        wrapped.setText(newText);
    }

    /**
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
     *
     * @param newText An iterator over the text to analyze.
     * @throws NullPointerException if {@code newText} is null.
     */
    @Override
    public void setText(CharacterIterator newText) {
        // The result is deliberately discarded: dereferencing newText here
        // makes a null argument fail with a NullPointerException before it is
        // handed to the wrapped iterator.
        newText.current();
        wrapped.setText(newText);
    }
}