-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
31 changed files
with
11,292 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
103 changes: 103 additions & 0 deletions
103
lib/src/jmh/java/com/github/promeg/pinyinhelper/PinyinDictBenchmark.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package com.github.promeg.pinyinhelper; | ||
|
||
import com.github.promeg.tinypinyin.lexicons.java.cncity.CnCityDict; | ||
|
||
import net.sourceforge.pinyin4j.PinyinHelper; | ||
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; | ||
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; | ||
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; | ||
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; | ||
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; | ||
|
||
import org.apache.commons.lang3.RandomStringUtils; | ||
import org.openjdk.jmh.annotations.Benchmark; | ||
|
||
import java.util.Random; | ||
|
||
/** | ||
* Created by guyacong on 2016/12/23. | ||
*/ | ||
|
||
public class PinyinDictBenchmark { | ||
static Random random = new Random(); | ||
static HanyuPinyinOutputFormat format; | ||
|
||
static { | ||
format = new HanyuPinyinOutputFormat(); | ||
format.setToneType(HanyuPinyinToneType.WITHOUT_TONE); | ||
format.setCaseType(HanyuPinyinCaseType.UPPERCASE); | ||
format.setVCharType(HanyuPinyinVCharType.WITH_V); | ||
} | ||
|
||
//@Benchmark | ||
public void measureMy_toPinyin_no_dict() { | ||
Pinyin.with(null).build().toPinyin(genRandomString(), ","); | ||
} | ||
|
||
@Benchmark | ||
public void measureMy_toPinyin_one_dict() { | ||
Pinyin.with(CnCityDict.getInstance()).build().toPinyin(genRandomString(), ","); | ||
} | ||
|
||
|
||
//@Benchmark | ||
public void measurePinyin4j_toPinyin() throws BadHanyuPinyinOutputFormatCombination { | ||
PinyinHelper.toHanyuPinyinString(genRandomString(), format, ","); | ||
} | ||
|
||
//@Benchmark | ||
public void measureMy_toPinyin_with_dict() { | ||
Pinyin.with(null).build().toPinyin(genRandomString(), ","); | ||
} | ||
|
||
|
||
private String genRandomString() { | ||
int length = random.nextInt(100); | ||
StringBuilder sb = new StringBuilder(); | ||
for (int i = 0; i < length; i++) { | ||
if (random.nextBoolean()) { | ||
sb.append(randomChinese()); | ||
} else { | ||
sb.append(RandomStringUtils.randomAscii(1)); | ||
} | ||
} | ||
return sb.toString(); | ||
} | ||
|
||
private static int chineseStart = Integer.parseInt(String.valueOf(0x4e00)); | ||
private static int chineseEnd = Integer.parseInt(String.valueOf(0x9FA5)); | ||
|
||
private static String randomChinese(){ | ||
Random random = new Random(); | ||
int position = random.nextInt(chineseEnd-chineseStart)+chineseStart; | ||
String code = Integer.toHexString(position); | ||
return decode("\\u"+code); | ||
} | ||
|
||
private static String decode(String unicodeStr) { | ||
if (unicodeStr == null) { | ||
return null; | ||
} | ||
StringBuffer sb = new StringBuffer(); | ||
int maxLoop = unicodeStr.length(); | ||
for (int i = 0; i <maxLoop; i++) { | ||
if (unicodeStr.charAt(i) == '\\') { | ||
if ((i <maxLoop - 5) | ||
&& ((unicodeStr.charAt(i + 1) == 'u') || ( | ||
unicodeStr.charAt(i + 1) == 'U'))) | ||
try { | ||
sb.append((char) Integer.parseInt(unicodeStr.substring(i + 2, i + 6), 16)); | ||
i += 5; | ||
} catch (NumberFormatException localNumberFormatException) { | ||
sb.append(unicodeStr.charAt(i)); | ||
} | ||
else | ||
sb.append(unicodeStr.charAt(i)); | ||
} else { | ||
sb.append(unicodeStr.charAt(i)); | ||
} | ||
} | ||
return sb.toString(); | ||
} | ||
|
||
} |
109 changes: 109 additions & 0 deletions
109
lib/src/main/java/com/github/promeg/pinyinhelper/Engine.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
package com.github.promeg.pinyinhelper; | ||
|
||
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
|
||
/** | ||
* Created by guyacong on 2016/12/23. | ||
*/ | ||
|
||
final class Engine { | ||
|
||
private Engine() { | ||
//no instance | ||
} | ||
|
||
//取词的最大长度,必须大于0 | ||
static final int WORD_MAX_LENGTH = 6; | ||
|
||
public static String toPinyin(String inputStr, List<PinyinDict> pinyinDictSet, String separator) { | ||
if (pinyinDictSet == null || pinyinDictSet.size() == 0) { | ||
// 没有提供字典,按单字符转换输出 | ||
StringBuffer resultPinyinStrBuf = new StringBuffer(); | ||
for (int i = 0; i < inputStr.length(); i++) { | ||
resultPinyinStrBuf.append(Pinyin.toPinyin(inputStr.charAt(i))); | ||
if (i != inputStr.length() - 1) { | ||
resultPinyinStrBuf.append(separator); | ||
} | ||
} | ||
return resultPinyinStrBuf.toString(); | ||
} | ||
|
||
List<String> segWords = new ArrayList<String>(); | ||
|
||
String word; | ||
int wordLength; | ||
int position; | ||
int segLength = 0; | ||
|
||
// 开始分词,循环以下操作,直到全部完成 | ||
while (segLength < inputStr.length()) { | ||
if ((inputStr.length() - segLength) < WORD_MAX_LENGTH) { | ||
wordLength = inputStr.length() - segLength; | ||
} else { | ||
wordLength = WORD_MAX_LENGTH; | ||
} | ||
|
||
position = segLength; | ||
word = inputStr.substring(position, position + wordLength); | ||
|
||
while (!dictSetContains(word, pinyinDictSet)) { | ||
if (word.length() == 1) { | ||
break; | ||
} | ||
|
||
word = word.substring(0, word.length() - 1); | ||
} | ||
|
||
segWords.add(word); | ||
segLength += word.length(); | ||
} | ||
|
||
StringBuffer resultPinyinStrBuf = new StringBuffer(); | ||
for (int i = 0; i < segWords.size(); i++) { | ||
String wordStr = segWords.get(i); | ||
|
||
if (wordStr.length() == 1) { | ||
resultPinyinStrBuf.append(Pinyin.toPinyin(wordStr.charAt(0))); | ||
} else { | ||
String[] fromDicts = pinyinFromDict(wordStr, pinyinDictSet); | ||
for (int j = 0; j < fromDicts.length; j++) { | ||
resultPinyinStrBuf.append(fromDicts[j].toUpperCase()); | ||
if (j != fromDicts.length - 1) { | ||
resultPinyinStrBuf.append(separator); | ||
} | ||
} | ||
} | ||
|
||
if (i != segWords.size() - 1) { | ||
resultPinyinStrBuf.append(separator); | ||
} | ||
} | ||
return resultPinyinStrBuf.toString(); | ||
} | ||
|
||
static boolean dictSetContains(String word, List<PinyinDict> pinyinDictSet) { | ||
if (pinyinDictSet != null) { | ||
for (PinyinDict dict : pinyinDictSet) { | ||
if (dict != null && dict.mapping() != null | ||
&& dict.mapping().containsKey(word)) { | ||
return true; | ||
} | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
static String[] pinyinFromDict(String wordInDict, List<PinyinDict> pinyinDictSet) { | ||
if (pinyinDictSet != null) { | ||
for (PinyinDict dict : pinyinDictSet) { | ||
if (dict != null && dict.mapping() != null | ||
&& dict.mapping().containsKey(wordInDict)) { | ||
return dict.mapping().get(wordInDict); | ||
} | ||
} | ||
} | ||
throw new IllegalArgumentException("No pinyin dict contains word: " + wordInDict); | ||
} | ||
|
||
} |
Oops, something went wrong.