Skip to content

Commit

Permalink
Merge pull request huaban#4 from linkerlin/master
Browse files Browse the repository at this point in the history
增加了词性
  • Loading branch information
piaolingxue committed Mar 26, 2014
2 parents e8fc155 + 7eb601e commit 8ef8c23
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 67 deletions.
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@
<version>4.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.1</version>
</dependency>
</dependencies>

<build>
Expand Down Expand Up @@ -77,7 +82,11 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.7</version>
<version>2.9.1</version>
<configuration>
<encoding>UTF-8</encoding>
<docencoding>UTF-8</docencoding>
</configuration>
<executions>
<execution>
<id>attach-javadocs</id>
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ public static boolean ccFind(char ch) {
}

/**
* 全角->半角,大写->小写
* @param input
* @return
* 全角 to 半角,大写 to 小写
* @param input 输入字符
* @return 转换后的字符
*/
public static char regularize(char input){
if (input == 12288) {
Expand Down
65 changes: 32 additions & 33 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

public class JiebaSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
private static FinalSeg finalSeg = FinalSeg.getInstance();
Expand Down Expand Up @@ -88,26 +88,26 @@ public List<SegToken> process(String paragraph, SegMode mode) {
if (sb.length() > 0) {
// process
if (mode == SegMode.SEARCH) {
for (String token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
for (Word word : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(word, offset, offset += word.length()));
}
} else {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2 = "";
Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsFreq(gram2))
gram2 = token.subSequence(j, j + 2);
if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
String gram3 = "";
Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsFreq(gram3))
gram3 = token.subSequence(j, j + 3);
if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
Expand All @@ -117,31 +117,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
sb = new StringBuilder();
offset = i;
}
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
if (wordDict.containsWord(paragraph.substring(i, i + 1)))
tokens.add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
else
tokens.add(new SegToken(Word.createWord(paragraph.substring(i, i + 1)), offset, ++offset));
}
}
if (sb.length() > 0)
if (mode == SegMode.SEARCH) {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
}
} else {
for (String token : sentenceProcess(sb.toString())) {
for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2 = "";
Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsFreq(gram2))
gram2 = token.subSequence(j, j + 2);
if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
String gram3 = "";
Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsFreq(gram3))
gram3 = token.subSequence(j, j + 3);
if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
Expand All @@ -152,8 +155,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

public List<String> sentenceProcess(String sentence) {
List<String> tokens = new ArrayList<String>();
public List<Word> sentenceProcess(String sentence) {
List<Word> tokens = new ArrayList<Word>();
int N = sentence.length();
Map<Integer, List<Integer>> dag = createDAG(sentence);
Map<Integer, Pair<Integer>> route = calc(sentence, dag);
Expand All @@ -169,32 +172,28 @@ public List<String> sentenceProcess(String sentence) {
else {
if (buf.length() > 0) {
if (buf.length() == 1) {
tokens.add(buf);
tokens.add(Word.createWord(buf));
buf = "";
} else {
if (wordDict.containsFreq(buf)) {
for (int i = 0; i < buf.length(); ++i) {
tokens.add(buf.substring(i, i + 1));
}
if (wordDict.containsWord(buf)) {
tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
buf = "";
}
}
tokens.add(lWord);
tokens.add(Word.createWord(lWord));
}
x = y;
}
if (buf.length() > 0) {
if (buf.length() == 1) {
tokens.add(buf);
tokens.add(Word.createWord(buf));
buf = "";
} else {
if (wordDict.containsFreq(buf)) {
for (int i = 0; i < buf.length(); ++i) {
tokens.add(buf.substring(i, i + 1));
}
if (wordDict.containsWord(buf)) {
tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
Expand Down
18 changes: 12 additions & 6 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
package com.huaban.analysis.jieba;

import org.apache.commons.lang3.StringUtils;

public class SegToken {
public String token;
public Word word;

public int startOffset;

public int endOffset;

public SegToken(String token, int startOffset, int endOffset) {
this.token = token;
this.startOffset = startOffset;
this.endOffset = endOffset;

public SegToken(Word word, int startOffset, int endOffset) {
this.word = word;
this.startOffset = startOffset;
this.endOffset = endOffset;
}

@Override
public String toString() {
return "[" + token + ", " + startOffset + ", " + endOffset + "]";
if (StringUtils.isBlank(this.word.getTokenType()))
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
else
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
}

}
122 changes: 122 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/Word.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package com.huaban.analysis.jieba;

/**
 * A token paired with a frequency and a token type, usable anywhere a
 * {@link CharSequence} is expected.
 *
 * <p>Instances are obtained through the {@code createWord} factories, which
 * return the dictionary's own entry whenever the token is already known to
 * {@link WordDictionary}.
 *
 * Created by linkerlin on 3/21/14.
 */
public class Word implements CharSequence {
    // NOTE(review): resolved once, when this class is first loaded. If
    // WordDictionary creates Word instances through createWord while it is
    // still initializing, the class-loading order matters -- confirm against
    // WordDictionary.
    private static WordDictionary wordDict = WordDictionary.getInstance();

    private String token;
    private Double freq;
    private String tokenType;

    private Word(String token, Double freq, String tokenType) {
        this.token = token;
        this.freq = freq;
        this.tokenType = tokenType;
    }

    /**
     * Returns the dictionary entry for {@code token} if the dictionary
     * contains it; otherwise creates a new word from the given values.
     *
     * @param token     the text of the word
     * @param freq      the frequency used when the token is not in the dictionary
     * @param tokenType the token type used when the token is not in the dictionary
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token, Double freq, String tokenType) {
        if (wordDict.containsWord(token)) {
            return wordDict.getWord(token);
        }
        return new Word(token, freq, tokenType);
    }

    /**
     * Same as {@link #createWord(String, Double, String)} with an empty token type.
     *
     * @param token the text of the word
     * @param freq  the frequency used when the token is not in the dictionary
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token, Double freq) {
        return createWord(token, freq, "");
    }

    /**
     * Same as {@link #createWord(String, Double, String)} with a frequency of
     * {@code 0.0} and an empty token type.
     *
     * @param token the text of the word
     * @return the dictionary's word for {@code token}, or a newly created one
     */
    public static Word createWord(String token) {
        return createWord(token, 0.0, "");
    }

    public String getToken() {
        return token;
    }

    public void setToken(String token) {
        this.token = token;
    }

    public Double getFreq() {
        return freq;
    }

    public void setFreq(Double freq) {
        this.freq = freq;
    }

    public String getTokenType() {
        return tokenType;
    }

    public void setTokenType(String tokenType) {
        this.tokenType = tokenType;
    }

    /**
     * Returns the length of the underlying token, i.e. the number of 16-bit
     * {@code char}s it contains.
     *
     * @return the number of {@code char}s in this word's token
     */
    @Override
    public int length() {
        return token.length();
    }

    /**
     * Returns the {@code char} of the underlying token at the given index.
     *
     * @param index the zero-based index of the {@code char} to return
     * @return the {@code char} at {@code index}
     * @throws IndexOutOfBoundsException if {@code index} is negative or not
     *                                   less than {@code length()}
     */
    @Override
    public char charAt(int index) {
        return token.charAt(index);
    }

    /**
     * Returns the word for the token's characters in {@code [start, end)}.
     *
     * <p>The sub-token is looked up in the dictionary first. Only when it is
     * not found is a new word created, and that word carries this word's
     * frequency and token type.
     *
     * @param start the start index, inclusive
     * @param end   the end index, exclusive
     * @return the word for the requested sub-token
     * @throws IndexOutOfBoundsException if {@code start} or {@code end} is
     *                                   negative, if {@code end} is greater
     *                                   than {@code length()}, or if
     *                                   {@code start} is greater than {@code end}
     */
    @Override
    public Word subSequence(int start, int end) {
        return createWord(token.subSequence(start, end).toString(), freq, tokenType);
    }

    /**
     * Returns the token text, as the {@link CharSequence} contract requires.
     * Without this override, string conversion of a word would yield the
     * default {@code Object} identity string instead of its characters.
     *
     * @return the characters of this word, in order
     */
    @Override
    public String toString() {
        return token;
    }
}
Loading

0 comments on commit 8ef8c23

Please sign in to comment.