
Commit

Added part-of-speech tags (增加了词性)
linkerlin committed Mar 21, 2014
1 parent e8fc155 commit cf276f0
Showing 6 changed files with 206 additions and 63 deletions.
5 changes: 5 additions & 0 deletions pom.xml
@@ -47,6 +47,11 @@
<version>4.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.1</version>
</dependency>
</dependencies>

<build>
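The commons-lang3 dependency appears to be added for StringUtils, which the updated SegToken.toString() further down uses to decide whether a token type should be printed. A minimal, standalone sketch of that check (not part of the commit; the class name is made up):

    import org.apache.commons.lang3.StringUtils;

    public class StringUtilsSketch {
        public static void main(String[] args) {
            // isBlank treats null, "" and whitespace-only strings alike,
            // so toString() can skip the token type when it was never set.
            System.out.println(StringUtils.isBlank(""));    // true
            System.out.println(StringUtils.isBlank("   ")); // true
            System.out.println(StringUtils.isBlank("ns"));  // false -> the token type is printed
        }
    }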
65 changes: 32 additions & 33 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -1,12 +1,12 @@
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

public class JiebaSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
private static FinalSeg finalSeg = FinalSeg.getInstance();
@@ -88,26 +88,26 @@ public List<SegToken> process(String paragraph, SegMode mode) {
if (sb.length() > 0) {
// process
if (mode == SegMode.SEARCH) {
- for (String token : sentenceProcess(sb.toString())) {
- tokens.add(new SegToken(token, offset, offset += token.length()));
+ for (Word word : sentenceProcess(sb.toString())) {
+ tokens.add(new SegToken(word, offset, offset += word.length()));
}
} else {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
- String gram2 = "";
+ Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
- gram2 = token.substring(j, j + 2);
- if (wordDict.containsFreq(gram2))
+ gram2 = token.subSequence(j, j + 2);
+ if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
- String gram3 = "";
+ Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
- gram3 = token.substring(j, j + 3);
- if (wordDict.containsFreq(gram3))
+ gram3 = token.subSequence(j, j + 3);
+ if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
@@ -117,31 +117,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
sb = new StringBuilder();
offset = i;
}
- tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
+ if (wordDict.containsWord(paragraph.substring(i, i + 1)))
+ tokens.add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
+ else
+ tokens.add(new SegToken(Word.createWord(paragraph.substring(i, i + 1)), offset, ++offset));
}
}
if (sb.length() > 0)
if (mode == SegMode.SEARCH) {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
}
} else {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
- String gram2 = "";
+ Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
- gram2 = token.substring(j, j + 2);
- if (wordDict.containsFreq(gram2))
+ gram2 = token.subSequence(j, j + 2);
+ if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
- String gram3 = "";
+ Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
- gram3 = token.substring(j, j + 3);
- if (wordDict.containsFreq(gram3))
+ gram3 = token.subSequence(j, j + 3);
+ if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
@@ -152,8 +155,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

- public List<String> sentenceProcess(String sentence) {
- List<String> tokens = new ArrayList<String>();
+ public List<Word> sentenceProcess(String sentence) {
+ List<Word> tokens = new ArrayList<Word>();
int N = sentence.length();
Map<Integer, List<Integer>> dag = createDAG(sentence);
Map<Integer, Pair<Integer>> route = calc(sentence, dag);
@@ -169,32 +172,28 @@ public List<String> sentenceProcess(String sentence) {
else {
if (buf.length() > 0) {
if (buf.length() == 1) {
- tokens.add(buf);
+ tokens.add(Word.createWord(buf));
buf = "";
} else {
- if (wordDict.containsFreq(buf)) {
- for (int i = 0; i < buf.length(); ++i) {
- tokens.add(buf.substring(i, i + 1));
- }
+ if (wordDict.containsWord(buf)) {
+ tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
buf = "";
}
}
- tokens.add(lWord);
+ tokens.add(Word.createWord(lWord));
}
x = y;
}
if (buf.length() > 0) {
if (buf.length() == 1) {
- tokens.add(buf);
+ tokens.add(Word.createWord(buf));
buf = "";
} else {
- if (wordDict.containsFreq(buf)) {
- for (int i = 0; i < buf.length(); ++i) {
- tokens.add(buf.substring(i, i + 1));
- }
+ if (wordDict.containsWord(buf)) {
+ tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
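Since sentenceProcess() now returns List<Word> and SegToken wraps a Word, the part of speech travels with each token through process(). A minimal usage sketch, assuming SegMode is the nested enum referenced in the hunk headers above and that the bundled dictionary loads on first use (the sentence and the printed offsets are illustrative):

    import java.util.List;

    import com.huaban.analysis.jieba.JiebaSegmenter;
    import com.huaban.analysis.jieba.SegToken;

    public class SegmenterSketch {
        public static void main(String[] args) {
            JiebaSegmenter segmenter = new JiebaSegmenter();
            List<SegToken> tokens = segmenter.process("我来到北京清华大学", JiebaSegmenter.SegMode.SEARCH);
            for (SegToken token : tokens) {
                // SegToken.toString() now appends the token type when the word
                // carries one, e.g. "[北京, 3, 5, ns]"; see SegToken.java below.
                System.out.println(token);
            }
        }
    }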
18 changes: 12 additions & 6 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -1,21 +1,27 @@
package com.huaban.analysis.jieba;

+ import org.apache.commons.lang3.StringUtils;

public class SegToken {
- public String token;
+ public Word word;

public int startOffset;

public int endOffset;

- public SegToken(String token, int startOffset, int endOffset) {
- this.token = token;
- this.startOffset = startOffset;
- this.endOffset = endOffset;

+ public SegToken(Word word, int startOffset, int endOffset) {
+ this.word = word;
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
}

@Override
public String toString() {
return "[" + token + ", " + startOffset + ", " + endOffset + "]";
if (StringUtils.isBlank(this.word.getTokenType()))
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
else
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
}

}
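For reference, a small sketch of the two output shapes toString() can now produce, assuming the main dictionary has been loaded and 清华大学 is in it with a token type (the offsets and the type nt are illustrative):

    import com.huaban.analysis.jieba.SegToken;
    import com.huaban.analysis.jieba.Word;
    import com.huaban.analysis.jieba.WordDictionary;

    public class SegTokenToStringSketch {
        public static void main(String[] args) {
            // A dictionary word carries its token type, so toString() appends it.
            SegToken known = new SegToken(WordDictionary.getInstance().getWord("清华大学"), 5, 9);
            // An out-of-vocabulary word gets an empty token type, so toString() omits it.
            SegToken unknown = new SegToken(Word.createWord("不存在的词"), 0, 5);
            System.out.println(known);   // e.g. [清华大学, 5, 9, nt]
            System.out.println(unknown); // [不存在的词, 0, 5]
        }
    }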
122 changes: 122 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/Word.java
@@ -0,0 +1,122 @@
package com.huaban.analysis.jieba;

/**
* Created by linkerlin on 3/21/14.
*/
public class Word implements CharSequence{
private String token;
private Double freq;
private String tokenType;
private static WordDictionary wordDict = WordDictionary.getInstance();

private Word(String token, Double freq, String tokenType){
this.token = token;
this.freq = freq;
this.tokenType = tokenType;
}

private Word(String token, Double freq){
this.token = token;
this.freq = freq;
this.tokenType = "";
}

private Word(String token){
this.token = token;
this.freq = 0.0;
this.tokenType = "";
}

public static Word createWord(String token, Double freq, String tokenType) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, freq, tokenType);
}

public static Word createWord(String token, Double freq) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, freq, "");
}

public static Word createWord(String token) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, 0.0, "");
}

public String getToken() {
return token;
}

public void setToken(String token) {
this.token = token;
}

public Double getFreq() {
return freq;
}

public void setFreq(Double freq) {
this.freq = freq;
}

public String getTokenType() {
return tokenType;
}

public void setTokenType(String tokenType) {
this.tokenType = tokenType;
}

/**
* Returns the length of this character sequence. The length is the number
* of 16-bit <code>char</code>s in the sequence.</p>
*
* @return the number of <code>char</code>s in this sequence
*/
@Override
public int length() {
return token.length();
}

/**
* Returns the <code>char</code> value at the specified index. An index ranges from zero
* to <tt>length() - 1</tt>. The first <code>char</code> value of the sequence is at
* index zero, the next at index one, and so on, as for array
* indexing. </p>
* <p/>
* <p>If the <code>char</code> value specified by the index is a
* <a href="{@docRoot}/java/lang/Character.html#unicode">surrogate</a>, the surrogate
* value is returned.
*
* @param index the index of the <code>char</code> value to be returned
* @return the specified <code>char</code> value
* @throws IndexOutOfBoundsException if the <tt>index</tt> argument is negative or not less than
* <tt>length()</tt>
*/
@Override
public char charAt(int index) {
return token.charAt(index);
}

/**
* Returns a new <code>CharSequence</code> that is a subsequence of this sequence.
* The subsequence starts with the <code>char</code> value at the specified index and
* ends with the <code>char</code> value at index <tt>end - 1</tt>. The length
* (in <code>char</code>s) of the
* returned sequence is <tt>end - start</tt>, so if <tt>start == end</tt>
* then an empty sequence is returned. </p>
*
* @param start the start index, inclusive
* @param end the end index, exclusive
* @return the specified subsequence
* @throws IndexOutOfBoundsException if <tt>start</tt> or <tt>end</tt> are negative,
* if <tt>end</tt> is greater than <tt>length()</tt>,
* or if <tt>start</tt> is greater than <tt>end</tt>
*/
@Override
public Word subSequence(int start, int end) {
return createWord(token.subSequence(start, end).toString(),freq,tokenType);
}
}
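Word is a dictionary-backed CharSequence: the createWord() factories hand back the shared dictionary entry when one exists, and only otherwise build a fresh Word with frequency 0.0 and an empty token type; subSequence() resolves its result against the dictionary the same way. A short sketch of that behavior, assuming the main dictionary has been loaded and contains 北京大学 and 北京 (the printed token type is illustrative):

    import com.huaban.analysis.jieba.Word;

    public class WordFactorySketch {
        public static void main(String[] args) {
            // Known word: the dictionary's own Word instance comes back, with its freq and token type.
            Word known = Word.createWord("北京大学");
            System.out.println(known.getToken() + " / " + known.getTokenType()); // e.g. 北京大学 / nt

            // Unknown word: a new Word with freq 0.0 and an empty token type.
            Word unknown = Word.createWord("不存在的词");
            System.out.println(unknown.getFreq() + " [" + unknown.getTokenType() + "]"); // 0.0 []

            // subSequence() also returns a Word, looked up against the dictionary again.
            Word sub = known.subSequence(0, 2);
            System.out.println(sub.getToken()); // 北京
        }
    }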
34 changes: 22 additions & 12 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -19,7 +19,7 @@ public class WordDictionary {
private static String USER_DICT_SUFFIX = ".dict";

public final TrieNode trie = new TrieNode();
- public final Map<String, Double> freqs = new HashMap<String, Double>();
+ public final Map<String, Word> freqs = new HashMap<String, Word>();
private Double minFreq = Double.MAX_VALUE;
private Double total = 0.0;
private static boolean isLoaded = false;
@@ -60,18 +60,19 @@ public void loadDict() {
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
- if (tokens.length < 2) continue;
+ if (tokens.length < 3) continue;

String word = tokens[0];
+ String tokenType = tokens[2];
double freq = Double.valueOf(tokens[1]);
total += freq;
word = addWord(word);
- freqs.put(word, freq);
+ freqs.put(word, Word.createWord(word, freq, tokenType));
}
// normalize
- for (Entry<String, Double> entry : freqs.entrySet()) {
- entry.setValue(Math.log(entry.getValue() / total));
- minFreq = Math.min(entry.getValue(), minFreq);
+ for (Entry<String, Word> entry : freqs.entrySet()) {
+ entry.getValue().setFreq(Math.log(entry.getValue().getFreq() / total));
+ minFreq = Math.min(entry.getValue().getFreq(), minFreq);
}
System.out.println(String.format("main dict load finished, time elapsed %d ms",
System.currentTimeMillis() - s));
Expand Down Expand Up @@ -121,12 +122,13 @@ public void loadUserDict(File userDict) {
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
- if (tokens.length < 2) continue;
+ if (tokens.length < 3) continue;

String word = tokens[0];
+ String tokenType = tokens[2];
double freq = Double.valueOf(tokens[1]);
word = addWord(word);
- freqs.put(word, Math.log(freq / total));
+ freqs.put(word, Word.createWord(word, Math.log(freq / total), tokenType));
count++;
}
System.out.println(String.format(
@@ -148,13 +150,21 @@ public TrieNode getTrie() {
return this.trie;
}

- public boolean containsFreq(String key) {
- return freqs.containsKey(key);
+ public boolean containsWord(String word) {
+ return freqs.containsKey(word);
}

+ public Word getWord(String token){
+ if(containsWord(token)){
+ return freqs.get(token);
+ } else {
+ return null;
+ }
+ }

public Double getFreq(String key) {
- if (containsFreq(key))
- return freqs.get(key);
+ if (containsWord(key))
+ return freqs.get(key).getFreq();
else
return minFreq;
}
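Both loaders now skip lines with fewer than three columns, so every dictionary entry is expected to read <word> <freq> <tokenType> (for example, a line such as 北京 34488 ns). A sketch of loading a user dictionary in that format (the file name and its two entries are hypothetical):

    import java.io.File;

    import com.huaban.analysis.jieba.WordDictionary;

    public class UserDictSketch {
        public static void main(String[] args) {
            // user.dict (hypothetical), one entry per line: <word> <freq> <tokenType>
            //   区块链 300 n
            //   小清新 3 n
            WordDictionary dict = WordDictionary.getInstance();
            dict.loadUserDict(new File("user.dict"));
            System.out.println(dict.containsWord("区块链"));            // true once loaded
            System.out.println(dict.getWord("区块链").getTokenType());  // n
        }
    }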