Skip to content

Commit

Permalink
BSP-135: fix search Chinese character with same pinyin issue. (infini…
Browse files Browse the repository at this point in the history
  • Loading branch information
ariesy authored May 9, 2021
1 parent 2f34d08 commit 8368d1a
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/main/java/org/elasticsearch/analysis/PinyinConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class PinyinConfig {
public boolean fixedPinyinOffset =false;
// after 6.0, offsets are strictly constrained and overlapping tokens are not allowed; with this parameter enabled, overlapping tokens are allowed by ignoring offsets. Please note that all position-related queries and highlighting will become incorrect, so you should use multi-fields and specify different settings for different query purposes. If you need offsets, set it to false. default: true.
public boolean ignorePinyinOffset =true;
public boolean keepSeparateChinese=false;

public PinyinConfig() {}
public PinyinConfig(Settings settings) {
Expand All @@ -43,5 +44,6 @@ public PinyinConfig(Settings settings) {
this.removeDuplicateTerm =settings.getAsBoolean("remove_duplicated_term", false);
this.fixedPinyinOffset =settings.getAsBoolean("fixed_pinyin_offset", false);
this.ignorePinyinOffset =settings.getAsBoolean("ignore_pinyin_offset", true);
this.keepSeparateChinese=settings.getAsBoolean("keep_separate_chinese", false);
}
}
38 changes: 38 additions & 0 deletions src/main/java/org/elasticsearch/index/analysis/ChineseUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.elasticsearch.index.analysis;

import org.nlpcn.commons.lang.util.StringUtil;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

/**
 * Helpers for picking Chinese characters out of a string, one {@code char} at a time.
 */
public class ChineseUtil {
    /**
     * First character (U+4E00, inclusive) of the range treated as Chinese.
     */
    public static final char CJK_UNIFIED_IDEOGRAPHS_START = '\u4E00';
    /**
     * Last character (U+9FA5, inclusive) of the range treated as Chinese.
     */
    public static final char CJK_UNIFIED_IDEOGRAPHS_END = '\u9FA5';

    /**
     * Splits {@code str} into one entry per {@code char}, index-aligned with the input.
     *
     * <p>Entry {@code i} is the single-character string for {@code str.charAt(i)} when that
     * character lies in [{@link #CJK_UNIFIED_IDEOGRAPHS_START}, {@link #CJK_UNIFIED_IDEOGRAPHS_END}],
     * and {@code null} otherwise.
     *
     * @param str the text to segment; may be {@code null}
     * @return an empty list when {@code str} is {@code null}, empty or whitespace-only; otherwise a
     *     random-access list with exactly {@code str.length()} entries
     */
    public static List<String> segmentChinese(String str) {
        if (isBlank(str)) {
            return Collections.emptyList();
        }

        final int length = str.length();
        // Always an ArrayList: callers look entries up by index, which a linked list
        // would turn into a linear walk per lookup on long inputs.
        List<String> segments = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            char c = str.charAt(i);
            boolean chinese = c >= CJK_UNIFIED_IDEOGRAPHS_START && c <= CJK_UNIFIED_IDEOGRAPHS_END;
            // null keeps the list aligned with the character positions of the input
            segments.add(chinese ? String.valueOf(c) : null);
        }
        return segments;
    }

    /**
     * Tells whether {@code str} is {@code null}, empty, or made only of characters for which
     * {@link Character#isWhitespace(char)} is true.
     */
    private static boolean isBlank(String str) {
        if (str == null) {
            return true;
        }
        for (int i = 0; i < str.length(); i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public PinyinTokenFilter(TokenStream in, PinyinConfig config) {
super(in);
this.config = config;
//validate config
if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) {
if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin || config.keepSeparateChinese)) {
throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time.");
}
candidate = new ArrayList<>();
Expand Down Expand Up @@ -97,7 +97,8 @@ private boolean readTerm() {
}

List<String> pinyinList = Pinyin.pinyin(source);
if (pinyinList.size() == 0) return false;
List<String> chineseList = ChineseUtil.segmentChinese(source);
if (pinyinList.size() == 0 || chineseList.size() == 0) return false;

StringBuilder buff = new StringBuilder();
int buffStartPosition = 0;
Expand Down Expand Up @@ -137,6 +138,7 @@ private boolean readTerm() {
}

String pinyin = pinyinList.get(i);
String chinese = chineseList.get(i);
if (pinyin != null && pinyin.length() > 0) {
position++;
firstLetters.append(pinyin.charAt(0));
Expand All @@ -146,6 +148,9 @@ private boolean readTerm() {
if (config.keepFullPinyin) {
addCandidate(new TermItem(pinyin, i, i + 1, position));
}
if(config.keepSeparateChinese){
addCandidate(new TermItem(chinese, i, i + 1, position));
}
if (config.keepJoinedFullPinyin) {
fullPinyinLetters.append(pinyin);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public PinyinTokenizer(PinyinConfig config) {
this.config = config;

//validate config
if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) {
if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin || config.keepSeparateChinese)) {
throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time.");
}
candidate = new ArrayList<>();
Expand Down Expand Up @@ -151,7 +151,8 @@ public final boolean incrementToken() throws IOException {
source = termAtt.toString();

List<String> pinyinList = Pinyin.pinyin(source);
if (pinyinList.size() == 0) return false;
List<String> chineseList = ChineseUtil.segmentChinese(source);
if (pinyinList.size() == 0 || chineseList.size() == 0) return false;

StringBuilder buff = new StringBuilder();
int buffStartPosition = 0;
Expand Down Expand Up @@ -194,6 +195,7 @@ public final boolean incrementToken() throws IOException {
boolean incrPosition = false;

String pinyin = pinyinList.get(i);
String chinese = chineseList.get(i);
if (pinyin != null && pinyin.length() > 0) {
firstLetters.append(pinyin.charAt(0));
if (config.keepSeparateFirstLetter & pinyin.length() > 1) {
Expand All @@ -207,6 +209,9 @@ public final boolean incrementToken() throws IOException {
}
addCandidate(new TermItem(pinyin, i, i + 1, position));
}
if(config.keepSeparateChinese){
addCandidate(new TermItem(chinese, i, i + 1, position));
}
if (config.keepJoinedFullPinyin) {
fullPinyinLetters.append(pinyin);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,47 @@ public void TestMixedPinyinTokenizer() throws IOException {

}

@Test
public void TestMixedPinyinTokenizer2() throws IOException {
    // Mixed Latin / Chinese inputs; only the first three are asserted on below.
    String[] inputs = {"ldh", "ldhua", "ld华", "刘德华", "刘de华", "liude华", " liude 华"};

    PinyinConfig config = new PinyinConfig();
    // switched off for this test
    config.keepFullPinyin = false;
    config.keepFirstLetter = false;
    config.keepSeparateFirstLetter = false;
    config.ignorePinyinOffset = false;
    // switched on for this test
    config.keepOriginal = true;
    config.keepNoneChinese = true;
    config.keepNoneChineseTogether = true;
    config.keepSeparateChinese = true;

    HashMap<String, ArrayList<TermItem>> termsByInput = getStringArrayListHashMap(inputs, config);

    // "ldh"
    ArrayList<TermItem> ldhTerms = termsByInput.get("ldh");
    Assert.assertEquals(4, ldhTerms.size());
    Assert.assertEquals("l", ldhTerms.get(0).term);
    Assert.assertEquals("ldh", ldhTerms.get(1).term);

    // "ldhua"
    ArrayList<TermItem> ldhuaTerms = termsByInput.get("ldhua");
    Assert.assertEquals(4, ldhuaTerms.size());
    Assert.assertEquals("l", ldhuaTerms.get(0).term);
    Assert.assertEquals("hua", ldhuaTerms.get(3).term);

    // "ld华": the Chinese character itself is expected as the last term
    ArrayList<TermItem> mixedTerms = termsByInput.get("ld华");
    Assert.assertEquals(4, mixedTerms.size());
    Assert.assertEquals("华", mixedTerms.get(3).term);
}

@Test
public void TestPinyinTokenizerOffsetWithExtraTerms() throws IOException {
String[] s =
Expand Down

0 comments on commit 8368d1a

Please sign in to comment.