-
Notifications
You must be signed in to change notification settings - Fork 0
/
HanziToPinyin.java
188 lines (165 loc) · 6.17 KB
/
HanziToPinyin.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.providers.contacts;
import android.icu.text.Transliterator;
import android.text.TextUtils;
import android.util.Log;
import java.util.ArrayList;
import java.util.Locale;
/**
 * Converts Chinese characters to their corresponding pinyin strings.
 *
 * <p>For characters with multiple possible pinyin strings, only one is selected, as decided by
 * the ICU {@link Transliterator}. Polyphones are not supported by this implementation.
 *
 * <p>Instances are obtained through {@link #getInstance()}. If the required transliterator data
 * is missing, the instance is still created but {@link #hasChineseTransliterator()} returns
 * {@code false}, {@link #transliterate(String)} returns {@code null} and
 * {@link #getTokens(String)} returns an empty list.
 */
public class HanziToPinyin {
    private static final String TAG = "HanziToPinyin";

    /** First code point that is no longer treated as plain ASCII. */
    private static final int ASCII_LIMIT = 128;
    /** Characters below this value (and at or above {@link #ASCII_LIMIT}) are extended Latin. */
    private static final int EXTENDED_LATIN_LIMIT = 0x250;
    /** First code point of the Latin Extended Additional block. */
    private static final int LATIN_EXTENDED_ADDITIONAL_FIRST = 0x1e00;
    /** Last code point (inclusive) of the Latin Extended Additional block. */
    private static final int LATIN_EXTENDED_ADDITIONAL_LAST = 0x1eff;

    /** Lazily created singleton; guarded by the {@code HanziToPinyin.class} monitor. */
    private static HanziToPinyin sInstance;

    /** Han to upper-case ASCII pinyin transliterator, or {@code null} if its data is missing. */
    private Transliterator mPinyinTransliterator;
    /** Latin to ASCII transliterator, or {@code null} if it could not be created. */
    private Transliterator mAsciiTransliterator;

    /**
     * A piece of the input together with its translation.
     */
    public static class Token {
        /**
         * Separator between target string for each source char
         */
        public static final String SEPARATOR = " ";

        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        /**
         * Creates an empty token; {@link #type} is 0 and both strings are {@code null} until set.
         */
        public Token() {
        }

        /**
         * Creates a fully populated token.
         *
         * @param type one of {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}
         * @param source original string before translation
         * @param target translated string
         */
        public Token(int type, String source, String target) {
            this.type = type;
            this.source = source;
            this.target = target;
        }

        /**
         * Type of this token: {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}.
         */
        public int type;

        /**
         * Original string before translation.
         */
        public String source;

        /**
         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
         * original string in source.
         */
        public String target;
    }

    /**
     * Creates the transliterators. When ICU rejects a transliterator id, a warning is logged and
     * whichever transliterator had not been assigned yet stays {@code null}.
     */
    private HanziToPinyin() {
        try {
            mPinyinTransliterator = Transliterator.getInstance(
                    "Han-Latin/Names; Latin-Ascii; Any-Upper");
            mAsciiTransliterator = Transliterator.getInstance("Latin-Ascii");
        } catch (IllegalArgumentException e) {
            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
                    + " HanziToPinyin is disabled");
        }
    }

    /**
     * @return {@code true} if the Han to pinyin transliterator was created successfully
     */
    public boolean hasChineseTransliterator() {
        return mPinyinTransliterator != null;
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the singleton {@code HanziToPinyin}
     */
    public static HanziToPinyin getInstance() {
        synchronized (HanziToPinyin.class) {
            if (sInstance == null) {
                sInstance = new HanziToPinyin();
            }
            return sInstance;
        }
    }

    /**
     * Classifies a single UTF-16 char and fills {@code token} with its source, target and type.
     *
     * <p>Callers must only invoke this when {@link #hasChineseTransliterator()} is {@code true},
     * because the fall-through path dereferences the pinyin transliterator.
     *
     * @param character the char to classify
     * @param token the token to populate; all three fields are overwritten
     */
    private void tokenize(char character, Token token) {
        token.source = Character.toString(character);

        // ASCII is passed through untouched.
        if (character < ASCII_LIMIT) {
            token.type = Token.LATIN;
            token.target = token.source;
            return;
        }

        // Extended Latin. Transcode these to ASCII equivalents. The upper bound is inclusive so
        // that U+1EFF, the last character of Latin Extended Additional, is handled as Latin
        // instead of being sent down the Han path.
        final boolean extendedLatin = character < EXTENDED_LATIN_LIMIT
                || (LATIN_EXTENDED_ADDITIONAL_FIRST <= character
                        && character <= LATIN_EXTENDED_ADDITIONAL_LAST);
        if (extendedLatin) {
            token.type = Token.LATIN;
            token.target = mAsciiTransliterator == null ? token.source :
                    mAsciiTransliterator.transliterate(token.source);
            return;
        }

        // Everything else is tried as Han. If the transliterator yields nothing, or hands the
        // char back unchanged, the char is reported as UNKNOWN with its source as target.
        token.type = Token.PINYIN;
        token.target = mPinyinTransliterator.transliterate(token.source);
        if (TextUtils.isEmpty(token.target) ||
                TextUtils.equals(token.source, token.target)) {
            token.type = Token.UNKNOWN;
            token.target = token.source;
        }
    }

    /**
     * Runs the whole input through the pinyin transliterator.
     *
     * @param input the text to transliterate
     * @return the transliterated text, or {@code null} if there is no Chinese transliterator or
     *         {@code input} is null or empty
     */
    public String transliterate(final String input) {
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            return null;
        }
        return mPinyinTransliterator.transliterate(input);
    }

    /**
     * Convert the input to a list of tokens. A run of Latin or Unknown characters without space
     * is put into one Token; each character that has pinyin becomes a Token of its own. Space
     * characters only delimit tokens and are never part of one.
     *
     * <p>The input is processed one UTF-16 {@code char} at a time.
     *
     * @param input the text to split
     * @return the tokens in input order; empty if there is no Chinese transliterator or
     *         {@code input} is null or empty
     */
    public ArrayList<Token> getTokens(final String input) {
        ArrayList<Token> tokens = new ArrayList<Token>();
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            // return empty tokens.
            return tokens;
        }
        final int inputLength = input.length();
        // Accumulates the targets of consecutive non-pinyin characters of the same type.
        final StringBuilder sb = new StringBuilder();
        int tokenType = Token.LATIN;
        Token token = new Token();
        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (Character.isSpaceChar(character)) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                continue;
            }
            tokenize(character, token);
            // Remember the type now: on the pinyin path the token object is handed over to the
            // list and replaced by a fresh, still untyped one.
            final int currentType = token.type;
            if (currentType == Token.PINYIN) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokens.add(token);
                token = new Token();
            } else {
                if (tokenType != currentType && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                sb.append(token.target);
            }
            tokenType = currentType;
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }
        return tokens;
    }

    /**
     * Appends the buffered text as one token, using it as both source and target, then clears
     * the buffer.
     *
     * @param sb buffer holding the pending text; emptied on return
     * @param tokens list that receives the new token
     * @param tokenType type assigned to the new token
     */
    private void addToken(
            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
        String str = sb.toString();
        tokens.add(new Token(tokenType, str, str));
        sb.setLength(0);
    }
}