Skip to content

Commit

Permalink
使用AhoCorasickDoubleArrayTrie实现高效的多词匹配
Browse files Browse the repository at this point in the history
  • Loading branch information
YacongGu committed Dec 28, 2016
1 parent a0fd381 commit c607682
Show file tree
Hide file tree
Showing 11 changed files with 345 additions and 91 deletions.
1 change: 1 addition & 0 deletions lib/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ dependencies {
compile 'com.belerweb:pinyin4j:2.5.0'
compile 'org.openjdk.jmh:jmh-core:1.3.3'
compile 'org.openjdk.jmh:jmh-generator-annprocess:1.3.3'
compile group: 'com.hankcs', name: 'aho-corasick-double-array-trie', version: '1.0.1'
}

// custom tasks for creating source/javadoc jars
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@
/**
* Created by guyacong on 2016/12/23.
*/

//CHECKSTYLE:OFF
public class PinyinDictBenchmark {
static Random random = new Random();
static HanyuPinyinOutputFormat format;
static Pinyin pinyin = Pinyin.with(CnCityDict.getInstance()).build();

static {
format = new HanyuPinyinOutputFormat();
Expand All @@ -36,7 +37,7 @@ public void measureMy_toPinyin_no_dict() {

@Benchmark
public void measureMy_toPinyin_one_dict() {
Pinyin.with(CnCityDict.getInstance()).build().toPinyin(genRandomString(), ",");
pinyin.toPinyin(genRandomString(), ",");
}


Expand Down Expand Up @@ -101,3 +102,4 @@ private static String decode(String unicodeStr) {
}

}
//CHECKSTYLE:ON
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
package com.github.promeg.pinyinhelper;


import org.openjdk.jmh.annotations.Benchmark;
import net.sourceforge.pinyin4j.PinyinHelper;

import java.util.Random;

/**
* Created by guyacong on 2015/9/28.
*/
//CHECKSTYLE:OFF
public class PinyinSampleBenchmark {

@Benchmark
//@Benchmark
public void measureMyIsChinese() {
Pinyin.isChinese(genRandomChar());
}

@Benchmark
//@Benchmark
public void measurePinyin4jIsChinese() {
isChinesePinyin4j(genRandomChar());
}

@Benchmark
//@Benchmark
public void measureMyToPinyin() {
Pinyin.toPinyin(genRandomChar());
}

@Benchmark
//@Benchmark
public void measurePinyin4jToPinyin() {
PinyinHelper.toHanyuPinyinStringArray(genRandomChar());
}
Expand All @@ -45,3 +45,4 @@ private char genRandomChar() {
return (char) (Character.MIN_VALUE + random.nextInt(Character.MAX_VALUE - Character.MIN_VALUE));
}
}
//CHECKSTYLE:ON
100 changes: 40 additions & 60 deletions lib/src/main/java/com/github/promeg/pinyinhelper/Engine.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package com.github.promeg.pinyinhelper;

import java.util.ArrayList;
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;

import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
Expand All @@ -9,16 +12,17 @@

final class Engine {

static final HitComparator HIT_COMPARATOR = new HitComparator();

private Engine() {
//no instance
}

//取词的最大长度,必须大于0
static final int WORD_MAX_LENGTH = 6;
static String toPinyin(final String inputStr, final AhoCorasickDoubleArrayTrie<String[]> trie,
final String separator, final SegmentationSelector<String[]> selector) {

public static String toPinyin(String inputStr, List<PinyinDict> pinyinDictSet, String separator) {
if (pinyinDictSet == null || pinyinDictSet.size() == 0) {
// 没有提供字典,按单字符转换输出
if (trie == null || trie.size() == 0 || selector == null) {
// 没有提供字典或选择器,按单字符转换输出
StringBuffer resultPinyinStrBuf = new StringBuffer();
for (int i = 0; i < inputStr.length(); i++) {
resultPinyinStrBuf.append(Pinyin.toPinyin(inputStr.charAt(i)));
Expand All @@ -29,81 +33,57 @@ public static String toPinyin(String inputStr, List<PinyinDict> pinyinDictSet, S
return resultPinyinStrBuf.toString();
}

List<String> segWords = new ArrayList<String>();

String word;
int wordLength;
int position;
int segLength = 0;

// 开始分词,循环以下操作,直到全部完成
while (segLength < inputStr.length()) {
if ((inputStr.length() - segLength) < WORD_MAX_LENGTH) {
wordLength = inputStr.length() - segLength;
} else {
wordLength = WORD_MAX_LENGTH;
}

position = segLength;
word = inputStr.substring(position, position + wordLength);

while (!dictSetContains(word, pinyinDictSet)) {
if (word.length() == 1) {
break;
}

word = word.substring(0, word.length() - 1);
}
List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> selectedHits = selector.select(trie.parseText(inputStr));

segWords.add(word);
segLength += word.length();
}
Collections.sort(selectedHits, HIT_COMPARATOR);

StringBuffer resultPinyinStrBuf = new StringBuffer();
for (int i = 0; i < segWords.size(); i++) {
String wordStr = segWords.get(i);

if (wordStr.length() == 1) {
resultPinyinStrBuf.append(Pinyin.toPinyin(wordStr.charAt(0)));
} else {
String[] fromDicts = pinyinFromDict(wordStr, pinyinDictSet);
int nextHitIndex = 0;

for (int i = 0; i < inputStr.length();) {
// 首先确认是否有以第i个字符作为begin的hit
if (nextHitIndex < selectedHits.size() && i == selectedHits.get(nextHitIndex).begin) {
// 有以第i个字符作为begin的hit
String[] fromDicts = selectedHits.get(nextHitIndex).value;
for (int j = 0; j < fromDicts.length; j++) {
resultPinyinStrBuf.append(fromDicts[j].toUpperCase());
if (j != fromDicts.length - 1) {
resultPinyinStrBuf.append(separator);
}
}

i = i + (selectedHits.get(nextHitIndex).end - selectedHits.get(nextHitIndex).begin);
nextHitIndex++;
} else {
// 将第i个字符转为拼音
resultPinyinStrBuf.append(Pinyin.toPinyin(inputStr.charAt(i)));
i++;
}

if (i != segWords.size() - 1) {
if (i != inputStr.length()) {
resultPinyinStrBuf.append(separator);
}
}

return resultPinyinStrBuf.toString();
}

static boolean dictSetContains(String word, List<PinyinDict> pinyinDictSet) {
if (pinyinDictSet != null) {
for (PinyinDict dict : pinyinDictSet) {
if (dict != null && dict.mapping() != null
&& dict.mapping().containsKey(word)) {
return true;
}
}
}
return false;
}
static final class HitComparator implements Comparator<AhoCorasickDoubleArrayTrie<java.lang.String[]>.Hit<String[]>> {

static String[] pinyinFromDict(String wordInDict, List<PinyinDict> pinyinDictSet) {
if (pinyinDictSet != null) {
for (PinyinDict dict : pinyinDictSet) {
if (dict != null && dict.mapping() != null
&& dict.mapping().containsKey(wordInDict)) {
return dict.mapping().get(wordInDict);
}
@Override
public int compare(AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]> o1,
AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]> o2) {
if (o1.begin == o2.begin) {
// 起点相同时,更长的排前面
int o1Length = o1.end - o1.begin;
int o2Length = o2.end - o2.begin;
return (o1Length < o2Length) ? 1 : ((o1Length == o2Length) ? 0 : -1);
} else {
// 起点小的放前面
return (o1.begin < o2.begin) ? -1 : ((o1.begin == o2.begin) ? 0 : 1);
}
}
throw new IllegalArgumentException("No pinyin dict contains word: " + wordInDict);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package com.github.promeg.pinyinhelper;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Forward maximum matching: walks the hits from left to right and, at each position,
 * keeps the longest hit that does not overlap a hit that was already kept.
 *
 * Created by guyacong on 2016/12/28.
 */

final class ForwardLongestSelector implements SegmentationSelector<String[]> {

    static final Engine.HitComparator HIT_COMPARATOR = new Engine.HitComparator();

    /**
     * Selects a set of non-overlapping hits, preferring the earliest start and, for hits
     * sharing a start, the longest one.
     *
     * <p>The input list is not modified; the work is done on a sorted copy.
     *
     * @param hits all hits found in the text, in any order; may be {@code null}
     * @return the kept hits ordered by {@link #HIT_COMPARATOR} (ascending begin),
     *         or {@code null} when {@code hits} is {@code null}
     */
    @Override
    public List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> select(
            final List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> hits) {
        if (hits == null) {
            return null;
        }

        // Sort a private copy so the caller's list keeps its original order.
        List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> sorted
                = new ArrayList<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>>(hits);
        Collections.sort(sorted, HIT_COMPARATOR);

        List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> selected
                = new ArrayList<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>>(sorted.size());

        // Index of the first character not yet covered by a kept hit.
        int coveredUntil = 0;

        for (AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]> hit : sorted) {
            // Engine treats (end - begin) as the hit length, i.e. "end" is exclusive, so a hit
            // that begins exactly at coveredUntil is adjacent to the previous one, not overlapping.
            if (hit.begin >= coveredUntil && hit.end > coveredUntil) {
                selected.add(hit);
                coveredUntil = hit.end;
            }
        }

        return selected;
    }
}
26 changes: 20 additions & 6 deletions lib/src/main/java/com/github/promeg/pinyinhelper/Pinyin.java
Original file line number Diff line number Diff line change
@@ -1,26 +1,29 @@
package com.github.promeg.pinyinhelper;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Created by guyacong on 2015/9/28.
*/
public final class Pinyin {

final List<PinyinDict> mPinyinDicts;
final AhoCorasickDoubleArrayTrie<String[]> mTrieDict;
final SegmentationSelector mSelector;

private Pinyin(List<PinyinDict> pinyinDicts) {
mPinyinDicts = Collections.unmodifiableList(pinyinDicts);
private Pinyin(List<PinyinDict> pinyinDicts, SegmentationSelector selector) {
mTrieDict = Utils.dictsToTrie(pinyinDicts);
mSelector = selector;
}

public static Builder with(PinyinDict dict) {
return new Builder(dict);
}

public String toPinyin(String str, String separator) {
return Engine.toPinyin(str, mPinyinDicts, separator);
return Engine.toPinyin(str, mTrieDict, separator, mSelector);
}

/**
Expand Down Expand Up @@ -76,6 +79,8 @@ private static short decodeIndex(byte[] paddings, byte[] indexes, int offset) {

public static final class Builder {

SegmentationSelector mSelector = null;

List<PinyinDict> mPinyinDicts = null;

private Builder(PinyinDict dict) {
Expand All @@ -92,8 +97,17 @@ public Builder with(PinyinDict dict) {
return this;
}

// 暂不公开此API
/*public*/ Builder selector(SegmentationSelector selector) {
if (selector != null) {
mSelector = selector;
}
return this;
}

public Pinyin build() {
return new Pinyin(mPinyinDicts);
// mSelector为null时,默认使用ForwardLongestSelector
return new Pinyin(mPinyinDicts, mSelector == null ? new ForwardLongestSelector() : mSelector);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.github.promeg.pinyinhelper;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;

import java.util.List;

/**
 * Interface that segmentation selection algorithms should implement.
 *
 * Created by guyacong on 2016/12/28.
 *
 * @param <T> type of the value attached to each trie hit
 */

public interface SegmentationSelector<T> {
/**
 * Chooses which of the given hits should be used.
 *
 * Engine passes in every hit the trie found in the input text and then sorts and
 * consumes the returned list.
 *
 * @param hits candidate hits found in the text
 * @return the hits selected by this algorithm
 */
List<AhoCorasickDoubleArrayTrie<T>.Hit<T>> select(List<AhoCorasickDoubleArrayTrie<T>.Hit<T>> hits);
}
39 changes: 39 additions & 0 deletions lib/src/main/java/com/github/promeg/pinyinhelper/Utils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.github.promeg.pinyinhelper;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Package-private static helpers.
 *
 * Created by guyacong on 2016/12/28.
 */

final class Utils {

    private Utils() {
        //no instance
    }

    /**
     * Merges the mappings of all given dictionaries into one Aho-Corasick double-array trie.
     *
     * <p>Dictionaries are merged from the last list element to the first, so when the same
     * word appears in several dictionaries the entry of the dictionary with the lowest index
     * is the one that ends up in the trie. {@code null} dictionaries and dictionaries whose
     * mapping is {@code null} are ignored.
     *
     * @param pinyinDicts dictionaries to merge; may be {@code null}
     * @return the built trie, or {@code null} when the list is {@code null} or no entries
     *         were collected
     */
    static AhoCorasickDoubleArrayTrie<String[]> dictsToTrie(List<PinyinDict> pinyinDicts) {
        if (pinyinDicts == null) {
            return null;
        }

        Map<String, String[]> merged = new HashMap<String, String[]>();

        int index = pinyinDicts.size();
        while (--index >= 0) {
            PinyinDict current = pinyinDicts.get(index);
            if (current == null || current.mapping() == null) {
                continue;
            }
            merged.putAll(current.mapping());
        }

        if (merged.isEmpty()) {
            return null;
        }

        AhoCorasickDoubleArrayTrie<String[]> result = new AhoCorasickDoubleArrayTrie<String[]>();
        result.build(merged);
        return result;
    }
}
Loading

0 comments on commit c607682

Please sign in to comment.