forked from promeG/TinyPinyin
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
使用AhoCorasickDoubleArrayTrie实现高效的多词匹配
- Loading branch information
Showing
11 changed files
with
345 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
lib/src/main/java/com/github/promeg/pinyinhelper/ForwardLongestSelector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package com.github.promeg.pinyinhelper; | ||
|
||
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
/** | ||
* 正向最大匹配 | ||
* | ||
* Created by guyacong on 2016/12/28. | ||
*/ | ||
|
||
final class ForwardLongestSelector implements SegmentationSelector<String[]> { | ||
|
||
static final Engine.HitComparator HIT_COMPARATOR = new Engine.HitComparator(); | ||
|
||
@Override | ||
public List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> select( | ||
final List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> hits) { | ||
if (hits == null) { | ||
return hits; | ||
} | ||
|
||
List<AhoCorasickDoubleArrayTrie<String[]>.Hit<String[]>> results = new ArrayList<AhoCorasickDoubleArrayTrie<java.lang.String[]>.Hit<String[]>>(hits); | ||
|
||
Collections.sort(hits, HIT_COMPARATOR); | ||
|
||
int endValueToRemove = -1; | ||
|
||
for (AhoCorasickDoubleArrayTrie.Hit hit : hits) { | ||
if (hit.begin > endValueToRemove && hit.end > endValueToRemove) { | ||
endValueToRemove = hit.end; | ||
} else { | ||
results.remove(hit); | ||
} | ||
} | ||
|
||
return results; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
lib/src/main/java/com/github/promeg/pinyinhelper/SegmentationSelector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.github.promeg.pinyinhelper; | ||
|
||
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* 分词选择算法应实现的接口 | ||
* | ||
* Created by guyacong on 2016/12/28. | ||
*/ | ||
|
||
public interface SegmentationSelector<T> { | ||
List<AhoCorasickDoubleArrayTrie<T>.Hit<T>> select(List<AhoCorasickDoubleArrayTrie<T>.Hit<T>> hits); | ||
} |
39 changes: 39 additions & 0 deletions
39
lib/src/main/java/com/github/promeg/pinyinhelper/Utils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package com.github.promeg.pinyinhelper; | ||
|
||
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; | ||
|
||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
/** | ||
* Created by guyacong on 2016/12/28. | ||
*/ | ||
|
||
final class Utils { | ||
|
||
private Utils() { | ||
//no instance | ||
} | ||
|
||
static AhoCorasickDoubleArrayTrie<String[]> dictsToTrie(List<PinyinDict> pinyinDicts) { | ||
Map<String, String[]> all = new HashMap<String, String[]>(); | ||
|
||
if (pinyinDicts != null) { | ||
for (int i = pinyinDicts.size() - 1; i >= 0; i--) { | ||
PinyinDict dict = pinyinDicts.get(i); | ||
if (dict != null && dict.mapping() != null) { | ||
all.putAll(dict.mapping()); | ||
} | ||
} | ||
if (all.size() > 0) { | ||
AhoCorasickDoubleArrayTrie<String[]> trie | ||
= new AhoCorasickDoubleArrayTrie<String[]>(); | ||
trie.build(all); | ||
return trie; | ||
} | ||
} | ||
|
||
return null; | ||
} | ||
} |
Oops, something went wrong.