
Commit

Added part-of-speech tags (增加了词性)
linkerlin committed Mar 21, 2014
1 parent e8fc155 commit cf276f0
Showing 6 changed files with 206 additions and 63 deletions.
5 changes: 5 additions & 0 deletions pom.xml
@@ -47,6 +47,11 @@
<version>4.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.1</version>
</dependency>
</dependencies>

<build>
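The commons-lang3 dependency appears to be added for StringUtils, which the updated SegToken.toString() further down uses to decide whether a token type should be printed. A minimal, standalone sketch of that check (not part of the commit; the class name is made up):

    import org.apache.commons.lang3.StringUtils;

    public class StringUtilsSketch {
        public static void main(String[] args) {
            // isBlank treats null, "" and whitespace-only strings alike,
            // so toString() can skip the token type when it was never set.
            System.out.println(StringUtils.isBlank(""));    // true
            System.out.println(StringUtils.isBlank("   ")); // true
            System.out.println(StringUtils.isBlank("ns"));  // false -> the token type is printed
        }
    }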
65 changes: 32 additions & 33 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -1,12 +1,12 @@
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

public class JiebaSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
private static FinalSeg finalSeg = FinalSeg.getInstance();
@@ -88,26 +88,26 @@ public List<SegToken> process(String paragraph, SegMode mode) {
if (sb.length() > 0) {
// process
if (mode == SegMode.SEARCH) {
- for (String token : sentenceProcess(sb.toString())) {
- tokens.add(new SegToken(token, offset, offset += token.length()));
+ for (Word word : sentenceProcess(sb.toString())) {
+ tokens.add(new SegToken(word, offset, offset += word.length()));
}
} else {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
- String gram2 = "";
+ Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
- gram2 = token.substring(j, j + 2);
- if (wordDict.containsFreq(gram2))
+ gram2 = token.subSequence(j, j + 2);
+ if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
- String gram3 = "";
+ Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
- gram3 = token.substring(j, j + 3);
- if (wordDict.containsFreq(gram3))
+ gram3 = token.subSequence(j, j + 3);
+ if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
@@ -117,31 +117,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
sb = new StringBuilder();
offset = i;
}
- tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
+ if (wordDict.containsWord(paragraph.substring(i, i + 1)))
+ tokens.add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
+ else
+ tokens.add(new SegToken(Word.createWord(paragraph.substring(i, i + 1)), offset, ++offset));
}
}
if (sb.length() > 0)
if (mode == SegMode.SEARCH) {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
}
} else {
- for (String token : sentenceProcess(sb.toString())) {
+ for (Word token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
- String gram2 = "";
+ Word gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
- gram2 = token.substring(j, j + 2);
- if (wordDict.containsFreq(gram2))
+ gram2 = token.subSequence(j, j + 2);
+ if (wordDict.containsWord(gram2.getToken()))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
}
}
if (token.length() > 3) {
- String gram3 = "";
+ Word gram3;
int j = 0;
for (; j < token.length() - 2; ++j) {
- gram3 = token.substring(j, j + 3);
- if (wordDict.containsFreq(gram3))
+ gram3 = token.subSequence(j, j + 3);
+ if (wordDict.containsWord(gram3.getToken()))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
}
}
@@ -152,8 +155,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

- public List<String> sentenceProcess(String sentence) {
- List<String> tokens = new ArrayList<String>();
+ public List<Word> sentenceProcess(String sentence) {
+ List<Word> tokens = new ArrayList<Word>();
int N = sentence.length();
Map<Integer, List<Integer>> dag = createDAG(sentence);
Map<Integer, Pair<Integer>> route = calc(sentence, dag);
@@ -169,32 +172,28 @@ public List<String> sentenceProcess(String sentence) {
else {
if (buf.length() > 0) {
if (buf.length() == 1) {
- tokens.add(buf);
+ tokens.add(Word.createWord(buf));
buf = "";
} else {
- if (wordDict.containsFreq(buf)) {
- for (int i = 0; i < buf.length(); ++i) {
- tokens.add(buf.substring(i, i + 1));
- }
+ if (wordDict.containsWord(buf)) {
+ tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
buf = "";
}
}
- tokens.add(lWord);
+ tokens.add(Word.createWord(lWord));
}
x = y;
}
if (buf.length() > 0) {
if (buf.length() == 1) {
- tokens.add(buf);
+ tokens.add(Word.createWord(buf));
buf = "";
} else {
- if (wordDict.containsFreq(buf)) {
- for (int i = 0; i < buf.length(); ++i) {
- tokens.add(buf.substring(i, i + 1));
- }
+ if (wordDict.containsWord(buf)) {
+ tokens.add(wordDict.getWord(buf));
} else {
finalSeg.cut(buf, tokens);
}
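Since sentenceProcess() now returns List<Word> and SegToken wraps a Word, the part of speech travels with each token through process(). A minimal usage sketch, assuming SegMode is the nested enum referenced in the hunk headers above and that the bundled dictionary loads on first use (the sentence and the printed offsets are illustrative):

    import java.util.List;

    import com.huaban.analysis.jieba.JiebaSegmenter;
    import com.huaban.analysis.jieba.SegToken;

    public class SegmenterSketch {
        public static void main(String[] args) {
            JiebaSegmenter segmenter = new JiebaSegmenter();
            List<SegToken> tokens = segmenter.process("我来到北京清华大学", JiebaSegmenter.SegMode.SEARCH);
            for (SegToken token : tokens) {
                // SegToken.toString() now appends the token type when the word
                // carries one, e.g. "[北京, 3, 5, ns]"; see SegToken.java below.
                System.out.println(token);
            }
        }
    }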
18 changes: 12 additions & 6 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -1,21 +1,27 @@
package com.huaban.analysis.jieba;

+ import org.apache.commons.lang3.StringUtils;

public class SegToken {
- public String token;
+ public Word word;

public int startOffset;

public int endOffset;

- public SegToken(String token, int startOffset, int endOffset) {
- this.token = token;
- this.startOffset = startOffset;
- this.endOffset = endOffset;

+ public SegToken(Word word, int startOffset, int endOffset) {
+ this.word = word;
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
}

@Override
public String toString() {
return "[" + token + ", " + startOffset + ", " + endOffset + "]";
if (StringUtils.isBlank(this.word.getTokenType()))
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
else
return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
}

}
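For reference, a small sketch of the two output shapes toString() can now produce, assuming the main dictionary has been loaded and 清华大学 is in it with a token type (the offsets and the type nt are illustrative):

    import com.huaban.analysis.jieba.SegToken;
    import com.huaban.analysis.jieba.Word;
    import com.huaban.analysis.jieba.WordDictionary;

    public class SegTokenToStringSketch {
        public static void main(String[] args) {
            // A dictionary word carries its token type, so toString() appends it.
            SegToken known = new SegToken(WordDictionary.getInstance().getWord("清华大学"), 5, 9);
            // An out-of-vocabulary word gets an empty token type, so toString() omits it.
            SegToken unknown = new SegToken(Word.createWord("不存在的词"), 0, 5);
            System.out.println(known);   // e.g. [清华大学, 5, 9, nt]
            System.out.println(unknown); // [不存在的词, 0, 5]
        }
    }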
122 changes: 122 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/Word.java
@@ -0,0 +1,122 @@
package com.huaban.analysis.jieba;

/**
* Created by linkerlin on 3/21/14.
*/
public class Word implements CharSequence{
private String token;
private Double freq;
private String tokenType;
private static WordDictionary wordDict = WordDictionary.getInstance();

private Word(String token, Double freq, String tokenType){
this.token = token;
this.freq = freq;
this.tokenType = tokenType;
}

private Word(String token, Double freq){
this.token = token;
this.freq = freq;
this.tokenType = "";
}

private Word(String token){
this.token = token;
this.freq = 0.0;
this.tokenType = "";
}

public static Word createWord(String token, Double freq, String tokenType) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, freq, tokenType);
}

public static Word createWord(String token, Double freq) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, freq, "");
}

public static Word createWord(String token) {
if(wordDict.containsWord(token))
return wordDict.getWord(token);
return new Word(token, 0.0, "");
}

public String getToken() {
return token;
}

public void setToken(String token) {
this.token = token;
}

public Double getFreq() {
return freq;
}

public void setFreq(Double freq) {
this.freq = freq;
}

public String getTokenType() {
return tokenType;
}

public void setTokenType(String tokenType) {
this.tokenType = tokenType;
}

/**
* Returns the length of this character sequence. The length is the number
* of 16-bit <code>char</code>s in the sequence.</p>
*
* @return the number of <code>char</code>s in this sequence
*/
@Override
public int length() {
return token.length();
}

/**
* Returns the <code>char</code> value at the specified index. An index ranges from zero
* to <tt>length() - 1</tt>. The first <code>char</code> value of the sequence is at
* index zero, the next at index one, and so on, as for array
* indexing. </p>
* <p/>
* <p>If the <code>char</code> value specified by the index is a
* <a href="{@docRoot}/java/lang/Character.html#unicode">surrogate</a>, the surrogate
* value is returned.
*
* @param index the index of the <code>char</code> value to be returned
* @return the specified <code>char</code> value
* @throws IndexOutOfBoundsException if the <tt>index</tt> argument is negative or not less than
* <tt>length()</tt>
*/
@Override
public char charAt(int index) {
return token.charAt(index);
}

/**
* Returns a new <code>CharSequence</code> that is a subsequence of this sequence.
* The subsequence starts with the <code>char</code> value at the specified index and
* ends with the <code>char</code> value at index <tt>end - 1</tt>. The length
* (in <code>char</code>s) of the
* returned sequence is <tt>end - start</tt>, so if <tt>start == end</tt>
* then an empty sequence is returned. </p>
*
* @param start the start index, inclusive
* @param end the end index, exclusive
* @return the specified subsequence
* @throws IndexOutOfBoundsException if <tt>start</tt> or <tt>end</tt> are negative,
* if <tt>end</tt> is greater than <tt>length()</tt>,
* or if <tt>start</tt> is greater than <tt>end</tt>
*/
@Override
public Word subSequence(int start, int end) {
return createWord(token.subSequence(start, end).toString(),freq,tokenType);
}
}
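Word is a dictionary-backed CharSequence: the createWord() factories hand back the shared dictionary entry when one exists, and only otherwise build a fresh Word with frequency 0.0 and an empty token type; subSequence() resolves its result against the dictionary the same way. A short sketch of that behavior, assuming the main dictionary has been loaded and contains 北京大学 and 北京 (the printed token type is illustrative):

    import com.huaban.analysis.jieba.Word;

    public class WordFactorySketch {
        public static void main(String[] args) {
            // Known word: the dictionary's own Word instance comes back, with its freq and token type.
            Word known = Word.createWord("北京大学");
            System.out.println(known.getToken() + " / " + known.getTokenType()); // e.g. 北京大学 / nt

            // Unknown word: a new Word with freq 0.0 and an empty token type.
            Word unknown = Word.createWord("不存在的词");
            System.out.println(unknown.getFreq() + " [" + unknown.getTokenType() + "]"); // 0.0 []

            // subSequence() also returns a Word, looked up against the dictionary again.
            Word sub = known.subSequence(0, 2);
            System.out.println(sub.getToken()); // 北京
        }
    }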
34 changes: 22 additions & 12 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -19,7 +19,7 @@ public class WordDictionary {
private static String USER_DICT_SUFFIX = ".dict";

public final TrieNode trie = new TrieNode();
- public final Map<String, Double> freqs = new HashMap<String, Double>();
+ public final Map<String, Word> freqs = new HashMap<String, Word>();
private Double minFreq = Double.MAX_VALUE;
private Double total = 0.0;
private static boolean isLoaded = false;
@@ -60,18 +60,19 @@ public void loadDict() {
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
- if (tokens.length < 2) continue;
+ if (tokens.length < 3) continue;

String word = tokens[0];
+ String tokenType = tokens[2];
double freq = Double.valueOf(tokens[1]);
total += freq;
word = addWord(word);
- freqs.put(word, freq);
+ freqs.put(word, Word.createWord(word, freq, tokenType));
}
// normalize
- for (Entry<String, Double> entry : freqs.entrySet()) {
- entry.setValue(Math.log(entry.getValue() / total));
- minFreq = Math.min(entry.getValue(), minFreq);
+ for (Entry<String, Word> entry : freqs.entrySet()) {
+ entry.getValue().setFreq(Math.log(entry.getValue().getFreq() / total));
+ minFreq = Math.min(entry.getValue().getFreq(), minFreq);
}
System.out.println(String.format("main dict load finished, time elapsed %d ms",
System.currentTimeMillis() - s));
Expand Down Expand Up @@ -121,12 +122,13 @@ public void loadUserDict(File userDict) {
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
- if (tokens.length < 2) continue;
+ if (tokens.length < 3) continue;

String word = tokens[0];
+ String tokenType = tokens[2];
double freq = Double.valueOf(tokens[1]);
word = addWord(word);
- freqs.put(word, Math.log(freq / total));
+ freqs.put(word, Word.createWord(word, Math.log(freq / total), tokenType));
count++;
}
System.out.println(String.format(
@@ -148,13 +150,21 @@ public TrieNode getTrie() {
return this.trie;
}

- public boolean containsFreq(String key) {
- return freqs.containsKey(key);
+ public boolean containsWord(String word) {
+ return freqs.containsKey(word);
}

+ public Word getWord(String token){
+ if(containsWord(token)){
+ return freqs.get(token);
+ } else {
+ return null;
+ }
+ }

public Double getFreq(String key) {
- if (containsFreq(key))
- return freqs.get(key);
+ if (containsWord(key))
+ return freqs.get(key).getFreq();
else
return minFreq;
}
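Both loaders now skip lines with fewer than three columns, so every dictionary entry is expected to read <word> <freq> <tokenType> (for example, a line such as 北京 34488 ns). A sketch of loading a user dictionary in that format (the file name and its two entries are hypothetical):

    import java.io.File;

    import com.huaban.analysis.jieba.WordDictionary;

    public class UserDictSketch {
        public static void main(String[] args) {
            // user.dict (hypothetical), one entry per line: <word> <freq> <tokenType>
            //   区块链 300 n
            //   小清新 3 n
            WordDictionary dict = WordDictionary.getInstance();
            dict.loadUserDict(new File("user.dict"));
            System.out.println(dict.containsWord("区块链"));            // true once loaded
            System.out.println(dict.getWord("区块链").getTokenType());  // n
        }
    }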