Merge pull request huaban#4 from linkerlin/master

增加了词性
wyanOrg · Mar 26, 2014 · 8ef8c23 · 8ef8c23
2 parents e8fc155 + 7eb601e
commit 8ef8c23
Show file tree

Hide file tree

Showing 7 changed files with 214 additions and 67 deletions.
diff --git a/pom.xml b/pom.xml
@@ -47,6 +47,11 @@
 			<version>4.8</version>
 			<scope>test</scope>
 		</dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.3.1</version>
+        </dependency>
 	</dependencies>
 
 	<build>
@@ -77,7 +82,11 @@
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-javadoc-plugin</artifactId>
-				<version>2.7</version>
+				<version>2.9.1</version>
+                <configuration>
+                    <encoding>UTF-8</encoding>
+                    <docencoding>UTF-8</docencoding>
+                </configuration>
 				<executions>
 					<execution>
 						<id>attach-javadocs</id>

diff --git a/src/main/java/com/huaban/analysis/jieba/CharacterUtil.java b/src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
@@ -37,9 +37,9 @@ public static boolean ccFind(char ch) {
     }
 
     /**
-     * 全角->半角,大写->小写
-     * @param input
-     * @return
+     * 全角 to 半角,大写 to 小写
+     * @param input 输入字符
+     * @return 转换后的字符
      */
 	public static char regularize(char input){
         if (input == 12288) {

diff --git a/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java b/src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -1,12 +1,12 @@
 package com.huaban.analysis.jieba;
 
+import com.huaban.analysis.jieba.viterbi.FinalSeg;
+
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import com.huaban.analysis.jieba.viterbi.FinalSeg;
-
 public class JiebaSegmenter {
     private static WordDictionary wordDict = WordDictionary.getInstance();
     private static FinalSeg finalSeg = FinalSeg.getInstance();
@@ -88,26 +88,26 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                 if (sb.length() > 0) {
                     // process
                     if (mode == SegMode.SEARCH) {
-                        for (String token : sentenceProcess(sb.toString())) {
-                            tokens.add(new SegToken(token, offset, offset += token.length()));
+                        for (Word word : sentenceProcess(sb.toString())) {
+                            tokens.add(new SegToken(word, offset, offset += word.length()));
                         }
                     } else {
-                        for (String token : sentenceProcess(sb.toString())) {
+                        for (Word token : sentenceProcess(sb.toString())) {
                             if (token.length() > 2) {
-                                String gram2 = "";
+                                Word gram2;
                                 int j = 0;
                                 for (; j < token.length() - 1; ++j) {
-                                    gram2 = token.substring(j, j + 2);
-                                    if (wordDict.containsFreq(gram2))
+                                    gram2 = token.subSequence(j, j + 2);
+                                    if (wordDict.containsWord(gram2.getToken()))
                                         tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                 }
                             }
                             if (token.length() > 3) {
-                                String gram3 = "";
+                                Word gram3;
                                 int j = 0;
                                 for (; j < token.length() - 2; ++j) {
-                                    gram3 = token.substring(j, j + 3);
-                                    if (wordDict.containsFreq(gram3))
+                                    gram3 = token.subSequence(j, j + 3);
+                                    if (wordDict.containsWord(gram3.getToken()))
                                         tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                 }
                             }
@@ -117,31 +117,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                     sb = new StringBuilder();
                     offset = i;
                 }
-                tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
+                if (wordDict.containsWord(paragraph.substring(i, i + 1)))
+                    tokens.add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
+                else
+                    tokens.add(new SegToken(Word.createWord(paragraph.substring(i, i + 1)), offset, ++offset));
             }
         }
         if (sb.length() > 0)
             if (mode == SegMode.SEARCH) {
-                for (String token : sentenceProcess(sb.toString())) {
+                for (Word token : sentenceProcess(sb.toString())) {
                     tokens.add(new SegToken(token, offset, offset += token.length()));
                 }
             } else {
-                for (String token : sentenceProcess(sb.toString())) {
+                for (Word token : sentenceProcess(sb.toString())) {
                     if (token.length() > 2) {
-                        String gram2 = "";
+                        Word gram2;
                         int j = 0;
                         for (; j < token.length() - 1; ++j) {
-                            gram2 = token.substring(j, j + 2);
-                            if (wordDict.containsFreq(gram2))
+                            gram2 = token.subSequence(j, j + 2);
+                            if (wordDict.containsWord(gram2.getToken()))
                                 tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                         }
                     }
                     if (token.length() > 3) {
-                        String gram3 = "";
+                        Word gram3;
                         int j = 0;
                         for (; j < token.length() - 2; ++j) {
-                            gram3 = token.substring(j, j + 3);
-                            if (wordDict.containsFreq(gram3))
+                            gram3 = token.subSequence(j, j + 3);
+                            if (wordDict.containsWord(gram3.getToken()))
                                 tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                         }
                     }
@@ -152,8 +155,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
         return tokens;
     }
 
-    public List<String> sentenceProcess(String sentence) {
-        List<String> tokens = new ArrayList<String>();
+    public List<Word> sentenceProcess(String sentence) {
+        List<Word> tokens = new ArrayList<Word>();
         int N = sentence.length();
         Map<Integer, List<Integer>> dag = createDAG(sentence);
         Map<Integer, Pair<Integer>> route = calc(sentence, dag);
@@ -169,32 +172,28 @@ public List<String> sentenceProcess(String sentence) {
             else {
                 if (buf.length() > 0) {
                     if (buf.length() == 1) {
-                        tokens.add(buf);
+                        tokens.add(Word.createWord(buf));
                         buf = "";
                     } else {
-                        if (wordDict.containsFreq(buf)) {
-                            for (int i = 0; i < buf.length(); ++i) {
-                                tokens.add(buf.substring(i, i + 1));
-                            }
+                        if (wordDict.containsWord(buf)) {
+                            tokens.add(wordDict.getWord(buf));
                         } else {
                             finalSeg.cut(buf, tokens);
                         }
                         buf = "";
                     }
                 }
-                tokens.add(lWord);
+                tokens.add(Word.createWord(lWord));
             }
             x = y;
         }
         if (buf.length() > 0) {
             if (buf.length() == 1) {
-                tokens.add(buf);
+                tokens.add(Word.createWord(buf));
                 buf = "";
             } else {
-                if (wordDict.containsFreq(buf)) {
-                    for (int i = 0; i < buf.length(); ++i) {
-                        tokens.add(buf.substring(i, i + 1));
-                    }
+                if (wordDict.containsWord(buf)) {
+                    tokens.add(wordDict.getWord(buf));
                 } else {
                     finalSeg.cut(buf, tokens);
                 }

diff --git a/src/main/java/com/huaban/analysis/jieba/SegToken.java b/src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -1,21 +1,27 @@
 package com.huaban.analysis.jieba;
 
+import org.apache.commons.lang3.StringUtils;
+
 public class SegToken {
-    public String token;
+    public Word word;
 
     public int startOffset;
 
     public int endOffset;
 
-    public SegToken(String token, int startOffset, int endOffset) {
-	this.token = token;
-	this.startOffset = startOffset;
-	this.endOffset = endOffset;
+
+    public SegToken(Word word, int startOffset, int endOffset) {
+        this.word = word;
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
     }
 
     @Override
     public String toString() {
-	return "[" + token + ", " + startOffset + ", " + endOffset + "]";
+        if (StringUtils.isBlank(this.word.getTokenType()))
+            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
+        else
+            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
     }
 
 }
diff --git a/src/main/java/com/huaban/analysis/jieba/Word.java b/src/main/java/com/huaban/analysis/jieba/Word.java
@@ -0,0 +1,122 @@
+package com.huaban.analysis.jieba;
+
+/**
+ * A token with a frequency and a token type, readable as a {@link CharSequence}.
+ * Instances come from the {@code createWord} factories, which return the
+ * {@link WordDictionary} entry whenever the dictionary contains the token.
+ *
+ * <p>Created by linkerlin on 3/21/14.
+ */
+public class Word implements CharSequence{
+    private String token;
+    private Double freq;
+    private String tokenType;
+    private static WordDictionary wordDict = WordDictionary.getInstance();
+
+    private Word(String token, Double freq, String tokenType){
+        this.token = token;
+        this.freq = freq;
+        this.tokenType = tokenType;
+    }
+
+    /**
+     * Returns the dictionary entry for {@code token} if one exists; otherwise creates
+     * a new word from the arguments.
+     *
+     * @param token     the token text
+     * @param freq      frequency, used only when the token is not in the dictionary
+     * @param tokenType token type, used only when the token is not in the dictionary
+     * @return the dictionary's word for {@code token}, or a newly created word
+     */
+    public static Word createWord(String token, Double freq, String tokenType) {
+        if(wordDict.containsWord(token))
+            return wordDict.getWord(token);
+        return new Word(token, freq, tokenType);
+    }
+
+    /** Same as {@link #createWord(String, Double, String)} with an empty token type. */
+    public static Word createWord(String token, Double freq) {
+        return createWord(token, freq, "");
+    }
+
+    /**
+     * Same as {@link #createWord(String, Double, String)} with frequency {@code 0.0}
+     * and an empty token type.
+     */
+    public static Word createWord(String token) {
+        return createWord(token, 0.0, "");
+    }
+
+    /** Returns the token text. */
+    public String getToken() {
+        return token;
+    }
+
+    /** Replaces the token text. */
+    public void setToken(String token) {
+        this.token = token;
+    }
+
+    /** Returns the frequency stored for this word. */
+    public Double getFreq() {
+        return freq;
+    }
+
+    /** Replaces the frequency stored for this word. */
+    public void setFreq(Double freq) {
+        this.freq = freq;
+    }
+
+    /** Returns the token type. */
+    public String getTokenType() {
+        return tokenType;
+    }
+
+    /** Replaces the token type. */
+    public void setTokenType(String tokenType) {
+        this.tokenType = tokenType;
+    }
+
+    /** Returns the number of {@code char}s in the token. */
+    @Override
+    public int length() {
+        return token.length();
+    }
+
+    /**
+     * Returns the {@code char} of the token at {@code index}.
+     *
+     * @throws IndexOutOfBoundsException if {@code index} is negative or not less than
+     *                                   {@code length()}
+     */
+    @Override
+    public char charAt(int index) {
+        return token.charAt(index);
+    }
+
+    /**
+     * Builds a word from the characters {@code start} (inclusive) to {@code end}
+     * (exclusive) of the token. The result is obtained through
+     * {@link #createWord(String, Double, String)}: the dictionary entry when the
+     * sub-token is in the dictionary, otherwise a new word carrying this word's
+     * frequency and token type.
+     *
+     * @throws IndexOutOfBoundsException if the range is not valid for the token
+     */
+    @Override
+    public Word subSequence(int start, int end) {
+        return createWord(token.subSequence(start, end).toString(),freq,tokenType);
+    }
+
+    /**
+     * Returns the token text. {@link CharSequence} requires {@code toString()} to
+     * return the characters of the sequence; without this override callers would get
+     * the identity string inherited from {@link Object}.
+     *
+     * @return the token text
+     */
+    @Override
+    public String toString() {
+        return token;
+    }
+}