Skip to content

Commit

Permalink
添加词典功能
Browse files Browse the repository at this point in the history
  • Loading branch information
YacongGu committed Dec 24, 2016
1 parent 7f2157b commit a0fd381
Show file tree
Hide file tree
Showing 31 changed files with 11,292 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ before_script:
script:
- echo "Travis branch is $TRAVIS_BRANCH"
# TRAVIS_PULL_REQUEST holds the pull request number, or "false" for non-PR builds.
- echo "Travis branch is in pull request $TRAVIS_PULL_REQUEST"
- ./gradlew :lib:test
- ./gradlew :lib:test :tinypinyin-lexicons-android-cncity:test
14 changes: 11 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ buildscript {
jcenter()
}
dependencies {
classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.1'
classpath 'com.github.dcendents:android-maven-plugin:1.2'
classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.7.1'
classpath 'com.github.dcendents:android-maven-gradle-plugin:1.5'
}
}

Expand All @@ -20,5 +20,13 @@ allprojects {

ext {
groupName = 'com.github.promeg'
releaseVersion = "1.0.0"
releaseVersionName = "1.0.0"
releaseVersionCode = 2

// Android
androidBuildToolsVersion = '24.0.3'
androidMinSdkVersion = 16
androidTargetSdkVersion = 24
androidCompileSdkVersion = 24
androidSupportSdkVersion = '24.2.1'
}
2 changes: 2 additions & 0 deletions config/quality-android.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ task checkstyle(type: Checkstyle, group: 'verification') {
source 'src'
include '**/*.java'
exclude '**/gen/**'
exclude '**/test/**'
exclude '**/androidTest/**'
exclude '**/R.java'
exclude '**/BuildConfig.java'

Expand Down
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-2.2-all.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-2.14.1-all.zip
Empty file modified gradlew
100644 → 100755
Empty file.
54 changes: 41 additions & 13 deletions lib/build.gradle
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
import java.text.SimpleDateFormat

apply plugin: 'java'
apply plugin: 'com.github.johnrengelman.shadow'
apply plugin: 'me.champeau.gradle.jmh'
apply plugin: 'maven-publish'
apply plugin: 'com.jfrog.bintray'
apply from: '../config/quality-java.gradle'

tasks.compileJava.dependsOn('checkstyle')

group = rootProject.ext.groupName
version = rootProject.ext.releaseVersion
version = rootProject.ext.releaseVersionName

buildscript {
repositories {
mavenLocal()
jcenter()
maven {
url "https://plugins.gradle.org/m2/"
}
}

dependencies {
classpath 'com.github.jengelman.gradle.plugins:shadow:1.2.0'
classpath "me.champeau.gradle:jmh-gradle-plugin:0.2.0"
classpath "me.champeau.gradle:jmh-gradle-plugin:0.3.1"
classpath 'org.apache.ant:ant:1.9.7'
}

}
Expand All @@ -30,12 +30,13 @@ targetCompatibility = 1.6

jmh {
jmhVersion = '1.3.3'
include = '.*SampleBenchmark.*'
include = ['*.PinyinDictBenchmark.*']
operationsPerInvocation = 10
benchmarkMode = 'thrpt'
benchmarkMode = ['thrpt']
verbosity = 'NORMAL'
fork = 2
timeUnit = 'us'
duplicateClassesStrategy = 'warn'
}

configurations {
Expand All @@ -51,6 +52,31 @@ project.sourceSets.jmh {
runtimeClasspath += project.configurations.jmh + project.sourceSets.main.output
}

// Builds a self-contained benchmark jar (classifier 'jmh') whose Main-Class is the JMH
// launcher. It bundles the unpacked 'jmh' configuration together with the compiled
// jmh and main source sets.
task jmhJarFixed(type: Jar, dependsOn: jmhClasses) {
// The jar contents are registered in doFirst, i.e. when the task executes rather than at
// configuration time.
// NOTE(review): presumably so the 'jmh' configuration is resolved as late as possible --
// confirm; inputs declared this way are not visible to up-to-date checks.
doFirst {
// Directories on the 'jmh' configuration are copied as-is, archives are unpacked.
from (project.configurations.jmh.collect {it.isDirectory() ? it : project.zipTree(it)}) {
// Drop service descriptors and jar signature files of the bundled dependencies.
exclude '**/META-INF/services/**'
exclude '**/META-INF/*.SF'
exclude '**/META-INF/*.DSA'
exclude '**/META-INF/*.RSA'
}
from project.sourceSets.jmh.output
from project.sourceSets.main.output
// When the same path is contributed more than once, keep only the first copy.
duplicatesStrategy(DuplicatesStrategy.EXCLUDE)
}

manifest {
attributes 'Main-Class':'org.openjdk.jmh.Main'
}

classifier = 'jmh'
}

// Runs the JMH launcher against the benchmark jar built by jmhJarFixed.
task jmhFixed(type: JavaExec, dependsOn: jmhJarFixed) {
    main = 'org.openjdk.jmh.Main'
    // Reference the jar of the task this one actually depends on; pointing at the plugin's
    // jmhJar only works as long as both tasks happen to write to the same archive path.
    classpath = project.files(project.jmhJarFixed.archivePath) + project.sourceSets.main.runtimeClasspath
}


dependencies {
testCompile 'junit:junit:4.12'
Expand All @@ -59,9 +85,11 @@ dependencies {
testCompile 'org.mockito:mockito-core:1.10.19'
testCompile 'com.belerweb:pinyin4j:2.5.0'

jmh 'com.belerweb:pinyin4j:2.5.0'
jmh 'org.openjdk.jmh:jmh-core:1.3.3'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.3.3'
compile fileTree(dir: 'jmh-libs', include: ['*.jar'])
compile 'org.apache.commons:commons-lang3:3.0'
compile 'com.belerweb:pinyin4j:2.5.0'
compile 'org.openjdk.jmh:jmh-core:1.3.3'
compile 'org.openjdk.jmh:jmh-generator-annprocess:1.3.3'
}

// custom tasks for creating source/javadoc jars
Expand Down Expand Up @@ -108,7 +136,7 @@ bintray {
pkg {
repo = 'maven'
name = 'tinypinyin'
desc = '适用于Java和Android的快速、低内存占用的汉字转拼音库。 '
desc = '适用于Java和Android的快速、低内存占用的汉字转拼音库。'
websiteUrl = 'https://github.com/promeG/TinyPinyin'
issueTrackerUrl = 'https://github.com/promeG/TinyPinyin/issues'
vcsUrl = 'https://github.com/promeG/TinyPinyin.git'
Expand All @@ -117,7 +145,7 @@ bintray {
publicDownloadNumbers = true
//Optional version descriptor
version {
name = rootProject.ext.releaseVersion //Bintray logical version name
name = rootProject.ext.releaseVersionName //Bintray logical version name
released = new SimpleDateFormat('yyyy-MM-dd\'T\'HH:mm:ss.SSSZZ').format(new Date())
vcsTag = '1.0.0'
mavenCentralSync {
Expand Down
Binary file added lib/jmh-libs/tinypinyin-lexicons-java-cncity.jar
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package com.github.promeg.pinyinhelper;

import com.github.promeg.tinypinyin.lexicons.java.cncity.CnCityDict;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import org.apache.commons.lang3.RandomStringUtils;
import org.openjdk.jmh.annotations.Benchmark;

import java.util.Random;

/**
 * JMH benchmarks that convert randomly generated mixed Chinese/ASCII strings to pinyin,
 * with and without a dictionary, and with pinyin4j for comparison.
 *
 * <p>Only methods carrying an active {@code @Benchmark} annotation are executed; the
 * others are kept with the annotation commented out so they can be re-enabled by hand.
 *
 * <p>Every benchmark method returns its conversion result so that JMH consumes it and
 * the JIT cannot eliminate the measured call as dead code.
 *
 * Created by guyacong on 2016/12/23.
 */
public class PinyinDictBenchmark {

    /** First code point of the range random Chinese characters are drawn from. */
    private static final int CHINESE_START = 0x4e00;

    /** Last code point (inclusive) of the range random Chinese characters are drawn from. */
    private static final int CHINESE_END = 0x9FA5;

    static Random random = new Random();
    static HanyuPinyinOutputFormat format;

    static {
        // pinyin4j output settings: no tone marks, upper case, 'v' for u-umlaut.
        format = new HanyuPinyinOutputFormat();
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setCaseType(HanyuPinyinCaseType.UPPERCASE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
    }

    /**
     * Conversion without any dictionary.
     *
     * @return the conversion result, returned only so that JMH consumes it
     */
    //@Benchmark
    public Object measureMy_toPinyin_no_dict() {
        return Pinyin.with(null).build().toPinyin(genRandomString(), ",");
    }

    /**
     * Conversion with the Chinese city dictionary.
     *
     * @return the conversion result, returned only so that JMH consumes it
     */
    @Benchmark
    public Object measureMy_toPinyin_one_dict() {
        return Pinyin.with(CnCityDict.getInstance()).build().toPinyin(genRandomString(), ",");
    }

    /**
     * Reference measurement using pinyin4j.
     *
     * @return the conversion result, returned only so that JMH consumes it
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects {@link #format}
     */
    //@Benchmark
    public Object measurePinyin4j_toPinyin() throws BadHanyuPinyinOutputFormatCombination {
        return PinyinHelper.toHanyuPinyinString(genRandomString(), format, ",");
    }

    /**
     * NOTE(review): despite its name this passes {@code null} as the dictionary, exactly like
     * {@link #measureMy_toPinyin_no_dict()} -- confirm which dictionary was intended before
     * re-enabling it.
     *
     * @return the conversion result, returned only so that JMH consumes it
     */
    //@Benchmark
    public Object measureMy_toPinyin_with_dict() {
        return Pinyin.with(null).build().toPinyin(genRandomString(), ",");
    }

    /**
     * Builds a string of 0 to 99 characters, each one either a random Chinese character or
     * a random ASCII character with equal probability.
     */
    private String genRandomString() {
        int length = random.nextInt(100);
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            if (random.nextBoolean()) {
                sb.append(randomChinese());
            } else {
                sb.append(RandomStringUtils.randomAscii(1));
            }
        }
        return sb.toString();
    }

    /**
     * Returns one random character from {@code CHINESE_START} to {@code CHINESE_END}, both
     * inclusive. The whole range lies in the Basic Multilingual Plane, so a single
     * {@code char} is sufficient.
     */
    private static String randomChinese() {
        int codePoint = CHINESE_START + random.nextInt(CHINESE_END - CHINESE_START + 1);
        return String.valueOf((char) codePoint);
    }

}
109 changes: 109 additions & 0 deletions lib/src/main/java/com/github/promeg/pinyinhelper/Engine.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package com.github.promeg.pinyinhelper;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Converts strings to pinyin, optionally using word dictionaries.
 *
 * <p>When dictionaries are supplied, the input is segmented with forward maximum matching:
 * at every position the longest prefix (at most {@link #WORD_MAX_LENGTH} characters) that
 * is a key of some dictionary is taken as one word; if none matches, a single character
 * is taken.
 *
 * Created by guyacong on 2016/12/23.
 */
final class Engine {

    // Maximum length of a word tried against the dictionaries; must be greater than 0.
    static final int WORD_MAX_LENGTH = 6;

    private Engine() {
        //no instance
    }

    /**
     * Converts {@code inputStr} to pinyin, joining the pinyin of the individual characters
     * with {@code separator}.
     *
     * @param inputStr      the text to convert; {@code null} or empty input is returned as-is
     * @param pinyinDictSet dictionaries consulted in list order; when {@code null} or empty
     *                      every character is converted on its own
     * @param separator     text inserted between the pinyin of adjacent characters
     * @return the converted text
     */
    public static String toPinyin(String inputStr, List<PinyinDict> pinyinDictSet,
            String separator) {
        if (inputStr == null || inputStr.length() == 0) {
            // Nothing to convert; also avoids a NullPointerException on null input.
            return inputStr;
        }

        if (pinyinDictSet == null || pinyinDictSet.isEmpty()) {
            // No dictionary supplied: convert character by character.
            return toPinyinCharByChar(inputStr, separator);
        }

        List<String> segWords = segment(inputStr, pinyinDictSet);

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < segWords.size(); i++) {
            if (i > 0) {
                result.append(separator);
            }

            String word = segWords.get(i);
            if (word.length() == 1) {
                // Single characters always go through the per-character conversion.
                result.append(Pinyin.toPinyin(word.charAt(0)));
                continue;
            }

            String[] fromDicts = pinyinFromDict(word, pinyinDictSet);
            for (int j = 0; j < fromDicts.length; j++) {
                if (j > 0) {
                    result.append(separator);
                }
                // Locale.ROOT keeps the upper-casing independent of the default locale
                // (under a Turkish locale "i" would otherwise become a dotted capital I).
                result.append(fromDicts[j].toUpperCase(Locale.ROOT));
            }
        }
        return result.toString();
    }

    /** Converts every character on its own and joins the results with {@code separator}. */
    private static String toPinyinCharByChar(String inputStr, String separator) {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < inputStr.length(); i++) {
            if (i > 0) {
                result.append(separator);
            }
            result.append(Pinyin.toPinyin(inputStr.charAt(i)));
        }
        return result.toString();
    }

    /** Splits {@code inputStr} into dictionary words and single characters, in input order. */
    private static List<String> segment(String inputStr, List<PinyinDict> pinyinDictSet) {
        List<String> segWords = new ArrayList<String>();
        int position = 0;

        while (position < inputStr.length()) {
            int wordLength = Math.min(WORD_MAX_LENGTH, inputStr.length() - position);
            String word = inputStr.substring(position, position + wordLength);

            // Shrink the candidate from the right until it is a dictionary word or only
            // one character is left.
            while (word.length() > 1 && !dictSetContains(word, pinyinDictSet)) {
                word = word.substring(0, word.length() - 1);
            }

            segWords.add(word);
            position += word.length();
        }
        return segWords;
    }

    /**
     * Tells whether any of the dictionaries has {@code word} as a key.
     *
     * @param word          the candidate word
     * @param pinyinDictSet dictionaries to search; {@code null} entries and entries whose
     *                      mapping is {@code null} are skipped
     * @return {@code true} if at least one dictionary contains {@code word}
     */
    static boolean dictSetContains(String word, List<PinyinDict> pinyinDictSet) {
        if (pinyinDictSet == null) {
            return false;
        }
        for (PinyinDict dict : pinyinDictSet) {
            if (dict != null && dict.mapping() != null
                    && dict.mapping().containsKey(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Looks up the pinyin of a dictionary word; the first dictionary containing it wins.
     *
     * @param wordInDict    a word expected to be a key of one of the dictionaries
     * @param pinyinDictSet dictionaries to search; {@code null} entries and entries whose
     *                      mapping is {@code null} are skipped
     * @return the value stored for {@code wordInDict} in the first matching dictionary
     * @throws IllegalArgumentException if no dictionary contains {@code wordInDict}
     */
    static String[] pinyinFromDict(String wordInDict, List<PinyinDict> pinyinDictSet) {
        if (pinyinDictSet != null) {
            for (PinyinDict dict : pinyinDictSet) {
                if (dict != null && dict.mapping() != null
                        && dict.mapping().containsKey(wordInDict)) {
                    return dict.mapping().get(wordInDict);
                }
            }
        }
        throw new IllegalArgumentException("No pinyin dict contains word: " + wordInDict);
    }

}
Loading

0 comments on commit a0fd381

Please sign in to comment.