Skip to content

Commit

Permalink
add parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Dec 1, 2017
1 parent c9cbed6 commit 438d528
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ The plugin includes analyzer: `pinyin` , tokenizer: `pinyin` and token-filter:
* `lowercase` lowercase non Chinese letters, default: true
* `trim_whitespace` default: true
* `remove_duplicated_term` when this option is enabled, duplicated terms will be removed to save index space, e.g. `de的`>`de`, default: false. NOTE: position-related queries may be affected
* `ignore_pinyin_offset` since 6.0, offsets are strictly constrained and overlapping tokens are not allowed; with this parameter enabled, overlapping tokens are allowed by ignoring the offset. Please note that all position-related queries and highlighting will become incorrect, so you should use multi-fields and specify different settings for different query purposes. If you need offsets, set it to false. default: true.



Expand Down
3 changes: 3 additions & 0 deletions src/main/java/org/elasticsearch/analysis/PinyinConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class PinyinConfig {
public boolean keepJoinedFullPinyin =false;
public boolean removeDuplicateTerm=false;
public boolean fixedPinyinOffset =false;
// Since 6.0, offsets are strictly constrained and overlapping tokens are not allowed. With this parameter enabled, overlapping tokens are allowed by ignoring the offset. Note that all position-related queries and highlighting will become incorrect, so use multi-fields with different settings for different query purposes. If you need offsets, set it to false. Default: true.
public boolean ignorePinyinOffset =true;

public PinyinConfig() {}
public PinyinConfig(Settings settings) {
Expand All @@ -40,5 +42,6 @@ public PinyinConfig(Settings settings) {
this.keepNoneChineseInJoinedFullPinyin =settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", false);
this.removeDuplicateTerm =settings.getAsBoolean("remove_duplicated_term", false);
this.fixedPinyinOffset =settings.getAsBoolean("fixed_pinyin_offset", false);
this.ignorePinyinOffset =settings.getAsBoolean("ignore_pinyin_offset", true);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,10 @@ void setTerm(String term, int startOffset, int endOffset, int position) {
if (endOffset < startOffset) {
endOffset = startOffset + term.length();
}
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));

if(!config.ignorePinyinOffset){
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
}

int offset = position - lastIncrementPosition;
if (offset < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ public void testTokenFilter() throws IOException {
config.keepNoneChinese = true;
config.keepOriginal = false;
config.keepFullPinyin = false;
config.ignorePinyinOffset = false;


StringReader sr = new StringReader("刘德华");
Analyzer analyzer = new StandardAnalyzer();
Expand Down Expand Up @@ -87,6 +89,8 @@ public void testTokenFilter() throws IOException {
config.keepNoneChinese = true;
config.keepOriginal = false;
config.keepFullPinyin = true;
config.ignorePinyinOffset = false;


sr = new StringReader("刘德华");
analyzer = new StandardAnalyzer();
Expand All @@ -110,6 +114,8 @@ public void testTokenFilter() throws IOException {
config.keepNoneChinese = true;
config.keepOriginal = true;
config.keepFullPinyin = true;
config.ignorePinyinOffset = false;


sr = new StringReader("刘德华");
analyzer = new StandardAnalyzer();
Expand Down Expand Up @@ -140,6 +146,8 @@ public void testTokenFilter() throws IOException {
config.keepNoneChinese = true;
config.keepOriginal = true;
config.keepFullPinyin = true;
config.ignorePinyinOffset = false;


sr = new StringReader("刘德华");
analyzer = new KeywordAnalyzer();
Expand Down Expand Up @@ -170,6 +178,8 @@ public void testTokenFilter() throws IOException {
config.keepFullPinyin = false;
config.LimitFirstLetterLength = 5;
config.lowercase = true;
config.ignorePinyinOffset = false;


sr = new StringReader("Go的数组是纯粹的值类型,传递一个[N]T的代价是N个T");
analyzer = new KeywordAnalyzer();
Expand All @@ -196,6 +206,8 @@ public void testTokenFilter() throws IOException {
config.keepFullPinyin = true;
config.LimitFirstLetterLength = 5;
config.lowercase = true;
config.ignorePinyinOffset = false;


sr = new StringReader("liu德hua 名字");
analyzer = new WhitespaceAnalyzer();
Expand Down Expand Up @@ -227,6 +239,8 @@ public void testTokenFilter() throws IOException {
config.lowercase = true;
config.noneChinesePinyinTokenize=true;
config.removeDuplicateTerm=false;
config.ignorePinyinOffset = false;


sr = new StringReader("liudehuaalibaba13zhuanghan134");
analyzer = new WhitespaceAnalyzer();
Expand Down Expand Up @@ -264,6 +278,7 @@ public void testTokenFilter() throws IOException {
config.lowercase=true;
config.trimWhitespace=true;
config.fixedPinyinOffset =true;
config.ignorePinyinOffset = false;

sr = new StringReader("刘德华");
analyzer = new WhitespaceAnalyzer();
Expand All @@ -285,6 +300,8 @@ private List<String> getTokenFilterResult(PinyinTokenFilter filter) throws IOEx
PositionIncrementAttribute position = filter.getAttribute(PositionIncrementAttribute.class);
pos=pos+position.getPositionIncrement();
pinyin.add(ta.toString());
Assert.assertTrue("startOffset must be non-negative",offset.startOffset()>=0);
Assert.assertTrue("endOffset must be >= startOffset",offset.startOffset()>=0);
System.out.println(ta.toString()+","+offset.startOffset()+","+offset.endOffset()+","+pos);
}
return pinyin;
Expand All @@ -303,6 +320,8 @@ public void TestTokenizer() throws IOException {
PinyinConfig config = new PinyinConfig();
config.noneChinesePinyinTokenize=false;
config.keepOriginal=true;
config.ignorePinyinOffset = false;

HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

ArrayList<TermItem> re = result.get("刘德华");
Expand Down Expand Up @@ -392,6 +411,8 @@ public void TestTokenizer() throws IOException {
config.keepFullPinyin = true;
config.LimitFirstLetterLength = 5;
config.lowercase = false;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -416,6 +437,8 @@ public void TestTokenizer() throws IOException {
config.LimitFirstLetterLength = 5;
config.removeDuplicateTerm = true;
config.lowercase = false;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -438,6 +461,8 @@ public void TestTokenizer() throws IOException {
config.keepOriginal=false;
config.lowercase=true;
config.trimWhitespace=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -457,6 +482,8 @@ public void TestTokenizer() throws IOException {
config.keepOriginal=false;
config.lowercase=true;
config.trimWhitespace=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -476,6 +503,8 @@ public void TestTokenizer() throws IOException {
config.keepOriginal=false;
config.lowercase=true;
config.trimWhitespace=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -495,6 +524,8 @@ public void TestTokenizer() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand Down Expand Up @@ -522,6 +553,8 @@ public void TestFirstLetters() throws IOException {
config.LimitFirstLetterLength = 16;
config.noneChinesePinyinTokenize = true;
config.lowercase = true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config);

Expand Down Expand Up @@ -553,6 +586,8 @@ public void TestOnlyLetters() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config);

Expand Down Expand Up @@ -582,6 +617,8 @@ public void TestOnlyLetters() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -605,6 +642,8 @@ public void TestOnlyLetters() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -626,6 +665,8 @@ public void TestOnlyLetters() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -648,6 +689,8 @@ public void TestOnlyLetters() throws IOException {
config.LimitFirstLetterLength=16;
config.noneChinesePinyinTokenize=true;
config.lowercase=true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s1, config);

Expand All @@ -672,6 +715,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
config.keepOriginal = false;
config.keepFullPinyin = false;
config.keepNoneChineseTogether = false;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand All @@ -697,6 +742,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
config.keepOriginal = false;
config.keepFullPinyin = false;
config.keepNoneChineseTogether = false;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s, config);

Expand All @@ -712,6 +759,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
config.keepOriginal = false;
config.keepFullPinyin = false;
config.noneChinesePinyinTokenize=false;
config.ignorePinyinOffset = false;

result = getStringArrayListHashMap(s, config);

re = result.get("DJ音乐家");
Expand All @@ -736,6 +785,8 @@ public void TestFullJoinedPinyin() throws IOException{
config.keepJoinedFullPinyin=true;
config.keepNoneChineseTogether = true;
config.keepNoneChineseInJoinedFullPinyin=true;
config.ignorePinyinOffset = false;

HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

ArrayList<TermItem> re = result.get("DJ音乐家");
Expand All @@ -759,6 +810,8 @@ public void TestMixedPinyinTokenizer() throws IOException {
config.keepOriginal = true;
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -814,6 +867,8 @@ public void TestPinyinTokenizerOffsetWithExtraTerms() throws IOException {
config.removeDuplicateTerm = true;
config.fixedPinyinOffset=false;
config.keepJoinedFullPinyin=false;
config.ignorePinyinOffset = false;



HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);
Expand Down Expand Up @@ -889,6 +944,7 @@ public void TestPinyinTokenizerOffset() throws IOException {
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.fixedPinyinOffset=false;
config.ignorePinyinOffset = false;

HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -963,6 +1019,8 @@ public void TestPinyinTokenizerFixedOffset() throws IOException {
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.fixedPinyinOffset=true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -1127,6 +1185,7 @@ public void TestPinyinPosition1() throws IOException {
config.keepOriginal = true;
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.ignorePinyinOffset = false;

HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -1178,6 +1237,8 @@ public void TestPinyinPosition2() throws IOException {
config.keepOriginal = true;
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -1225,6 +1286,8 @@ public void TestPinyinPosition3() throws IOException {
config.keepOriginal = true;
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

Expand Down Expand Up @@ -1273,6 +1336,8 @@ public void TestPinyinPosition4() throws IOException {
config.keepOriginal = true;
config.keepFullPinyin = true;
config.keepNoneChineseTogether = true;
config.ignorePinyinOffset = false;


HashMap<String, ArrayList<TermItem>> result= getStringArrayListHashMap(s, config);

Expand All @@ -1295,6 +1360,8 @@ public void TestPinyinPosition4() throws IOException {
config.keepFullPinyin = true;
config.keepNoneChineseTogether = false;
config.keepJoinedFullPinyin = true;
config.ignorePinyinOffset = false;


result = getStringArrayListHashMap(s, config);

Expand Down

0 comments on commit 438d528

Please sign in to comment.