Skip to content

Commit

Permalink
fix position
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Jun 2, 2020
1 parent ba40f18 commit 473175c
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 15 deletions.
43 changes: 28 additions & 15 deletions src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,11 @@ void addCandidate(TermItem item) {
}

//remove same term with same position
String fr=term+item.position;
String fr = term + item.position;

//remove same term, regardless position
if (config.removeDuplicateTerm) {
fr=term;
fr = term;
}

if (termsFilter.contains(fr)) {
Expand All @@ -102,7 +102,7 @@ void setTerm(String term, int startOffset, int endOffset, int position) {
}

//ignore empty term
if(term.length()==0){
if (term.length() == 0) {
return;
}

Expand All @@ -115,7 +115,7 @@ void setTerm(String term, int startOffset, int endOffset, int position) {
endOffset = startOffset + term.length();
}

if(!config.ignorePinyinOffset){
if (!config.ignorePinyinOffset) {
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
}

Expand Down Expand Up @@ -160,21 +160,21 @@ public final boolean incrementToken() throws IOException {
position = 0;

for (int i = 0; i < source.length(); i++) {

char c = source.charAt(i);
//keep original alphabet
if (c < 128) {
if (buff.length() <= 0) {
buffStartPosition = i+1;
buffStartPosition = i;
}
if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
if (config.keepNoneChinese) {
if (config.keepNoneChinese) {
if (config.keepNoneChineseTogether) {
buff.append(c);
buffSize++;
} else {
addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition));
}
if (config.keepNoneChineseTogether) {
buff.append(c);
buffSize++;
} else {
position++;
addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition + 1));
}
}
if (config.keepNoneChineseInFirstLetter) {
Expand All @@ -191,14 +191,20 @@ public final boolean incrementToken() throws IOException {
buffSize = parseBuff(buff, buffSize, buffStartPosition);
}

boolean incrPosition = false;

String pinyin = pinyinList.get(i);
if (pinyin != null && pinyin.length() > 0) {
position++;
firstLetters.append(pinyin.charAt(0));
if (config.keepSeparateFirstLetter & pinyin.length() > 1) {
position++;
incrPosition = true;
addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position));
}
if (config.keepFullPinyin) {
if (!incrPosition) {
position++;
}
addCandidate(new TermItem(pinyin, i, i + 1, position));
}
if (config.keepJoinedFullPinyin) {
Expand Down Expand Up @@ -277,11 +283,13 @@ private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
} else {
end = start + t.length();
}
addCandidate(new TermItem(result.get(i), start, end, ++position));
position++;
addCandidate(new TermItem(result.get(i), start, end, position));
start = end;
}
} else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) {
addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position));
position++;
addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, position));
}
}

Expand All @@ -293,6 +301,10 @@ private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
/**
 * Finishes the token stream.
 *
 * <p>After delegating to the superclass, advances {@code lastOffset} by one and
 * reports it (after offset correction) as both the start and the end offset,
 * unless {@code config.ignorePinyinOffset} is set, in which case the offset
 * attribute is left untouched.
 *
 * @throws IOException if the superclass {@code end()} fails
 */
@Override
public final void end() throws IOException {
    super.end();
    if (config.ignorePinyinOffset) {
        // Offsets are not being tracked; nothing further to report.
        return;
    }
    // NOTE(review): presumably lastOffset holds the index of the last consumed
    // character, so +1 yields the end-of-input offset - confirm against incrementToken().
    lastOffset += 1;
    offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(lastOffset));
}

@Override
Expand All @@ -312,6 +324,7 @@ public void reset() throws IOException {
candidate.clear();
source = null;
lastIncrementPosition = 0;
lastOffset = 0;
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,9 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
re = result.get("DJ音乐家");
Assert.assertEquals(2, re.size());
Assert.assertEquals("dj", re.get(0).term);
Assert.assertEquals(1, re.get(0).position);
Assert.assertEquals("djyyj", re.get(1).term);
Assert.assertEquals(1, re.get(1).position);

}

Expand Down Expand Up @@ -1274,8 +1276,100 @@ public void TestPinyinPosition2() throws IOException {
Assert.assertEquals(2, re.get(6).startOffset);
Assert.assertEquals(3, re.get(6).endOffset);
Assert.assertEquals(3, re.get(6).position);



}

/**
 * Checks the term text, offsets and positions produced for inputs that mix
 * leading ASCII letters with Chinese characters, when only separate first
 * letters are kept and pinyin offsets are not ignored.
 *
 * <p>Two configurations are exercised: non-Chinese characters emitted one by
 * one ({@code keepNoneChineseTogether = false}) and emitted as a single run
 * ({@code keepNoneChineseTogether = true}).
 *
 * @throws IOException if tokenizing the sample inputs fails
 */
@Test
public void TestPinyinPositionWithNonChinese() throws IOException {
    String[] s = {
            "l德华",
            "liu德华"
    };

    // Case 1: each non-Chinese character becomes its own term.
    PinyinConfig config = nonChineseSeparateFirstLetterConfig(false);
    HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config);

    ArrayList<TermItem> re = result.get("l德华");
    assertTermItem(re.get(0), "l", 0, 1, 1);
    assertTermItem(re.get(1), "d", 1, 2, 2);
    assertTermItem(re.get(2), "h", 2, 3, 3);

    // Case 2: consecutive non-Chinese characters are kept together as one term.
    config = nonChineseSeparateFirstLetterConfig(true);
    result = getStringArrayListHashMap(s, config);

    re = result.get("l德华");
    assertTermItem(re.get(0), "l", 0, 1, 1);
    assertTermItem(re.get(1), "d", 1, 2, 2);
    assertTermItem(re.get(2), "h", 2, 3, 3);

    re = result.get("liu德华");
    assertTermItem(re.get(0), "liu", 0, 3, 1);
    assertTermItem(re.get(1), "d", 3, 4, 2);
    assertTermItem(re.get(2), "h", 4, 5, 3);
}

/**
 * Builds the configuration shared by both cases of
 * {@code TestPinyinPositionWithNonChinese}; only the grouping of non-Chinese
 * characters differs between them.
 *
 * @param keepNoneChineseTogether value assigned to {@code config.keepNoneChineseTogether}
 * @return a fresh {@code PinyinConfig} with the test's flags applied
 */
private static PinyinConfig nonChineseSeparateFirstLetterConfig(boolean keepNoneChineseTogether) {
    PinyinConfig config = new PinyinConfig();
    config.keepFirstLetter = false;
    config.keepSeparateFirstLetter = true;

    config.keepNoneChinese = true;
    config.keepNoneChineseTogether = keepNoneChineseTogether;

    config.keepNoneChineseInFirstLetter = true;
    config.keepOriginal = false;
    config.keepFullPinyin = false;
    config.ignorePinyinOffset = false;
    return config;
}

/**
 * Asserts one emitted term, checking the fields in the order
 * term, start offset, end offset, position.
 *
 * @param item        the emitted term to inspect
 * @param term        expected term text
 * @param startOffset expected start offset
 * @param endOffset   expected end offset
 * @param position    expected position
 */
private static void assertTermItem(TermItem item, String term, int startOffset, int endOffset, int position) {
    Assert.assertEquals(term, item.term);
    Assert.assertEquals(startOffset, item.startOffset);
    Assert.assertEquals(endOffset, item.endOffset);
    Assert.assertEquals(position, item.position);
}


@Test
public void TestPinyinPosition3() throws IOException {
String[] s ={ "liude华","liudehua","ldhua","刘de华","刘dehua","DJ音乐家"};
Expand Down

0 comments on commit 473175c

Please sign in to comment.