Commit be099cb
Due to performance problems, part-of-speech tagging is removed for now; hopefully a better pull request can provide this feature.
piaolingxue committed Aug 13, 2014
1 parent 6954ee6 commit be099cb
Showing 5 changed files with 50 additions and 97 deletions.
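
For callers, the visible API change is that SegToken.word is now a plain String rather than a Word object, so the token text is read directly from the field and part-of-speech information is no longer exposed. A hedged before/after sketch (the Word accessors are those visible in the removed lines below):

    // Before this commit (Word-based API, as removed in this diff):
    //     String text = token.word.getToken();
    //     String pos  = token.word.getTokenType();   // POS tag
    // After this commit (String-based API):
    //     String text = token.word;                  // no POS tag available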
52 changes: 25 additions & 27 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -92,27 +92,27 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                 if (sb.length() > 0) {
                     // process
                     if (mode == SegMode.SEARCH) {
-                        for (Word word : sentenceProcess(sb.toString())) {
+                        for (String word : sentenceProcess(sb.toString())) {
                             tokens.add(new SegToken(word, offset, offset += word.length()));
                         }
                     }
                     else {
-                        for (Word token : sentenceProcess(sb.toString())) {
+                        for (String token : sentenceProcess(sb.toString())) {
                             if (token.length() > 2) {
-                                Word gram2;
+                                String gram2;
                                 int j = 0;
                                 for (; j < token.length() - 1; ++j) {
-                                    gram2 = token.subSequence(j, j + 2);
-                                    if (wordDict.containsWord(gram2.getToken()))
+                                    gram2 = token.substring(j, j + 2);
+                                    if (wordDict.containsWord(gram2))
                                         tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                 }
                             }
                             if (token.length() > 3) {
-                                Word gram3;
+                                String gram3;
                                 int j = 0;
                                 for (; j < token.length() - 2; ++j) {
-                                    gram3 = token.subSequence(j, j + 3);
-                                    if (wordDict.containsWord(gram3.getToken()))
+                                    gram3 = token.substring(j, j + 3);
+                                    if (wordDict.containsWord(gram3))
                                         tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                 }
                             }
@@ -123,36 +123,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                     offset = i;
                 }
                 if (wordDict.containsWord(paragraph.substring(i, i + 1)))
-                    tokens
-                        .add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
                 else
-                    tokens.add(new SegToken(wordDict.createWord(paragraph.substring(i, i + 1)), offset,
-                        ++offset));
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
             }
         }
         if (sb.length() > 0)
             if (mode == SegMode.SEARCH) {
-                for (Word token : sentenceProcess(sb.toString())) {
+                for (String token : sentenceProcess(sb.toString())) {
                     tokens.add(new SegToken(token, offset, offset += token.length()));
                 }
             }
             else {
-                for (Word token : sentenceProcess(sb.toString())) {
+                for (String token : sentenceProcess(sb.toString())) {
                     if (token.length() > 2) {
-                        Word gram2;
+                        String gram2;
                         int j = 0;
                         for (; j < token.length() - 1; ++j) {
-                            gram2 = token.subSequence(j, j + 2);
-                            if (wordDict.containsWord(gram2.getToken()))
+                            gram2 = token.substring(j, j + 2);
+                            if (wordDict.containsWord(gram2))
                                 tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                         }
                     }
                     if (token.length() > 3) {
-                        Word gram3;
+                        String gram3;
                        int j = 0;
                         for (; j < token.length() - 2; ++j) {
-                            gram3 = token.subSequence(j, j + 3);
-                            if (wordDict.containsWord(gram3.getToken()))
+                            gram3 = token.substring(j, j + 3);
+                            if (wordDict.containsWord(gram3))
                                 tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                         }
                     }
@@ -167,8 +165,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
     /*
      *
      */
-    public List<Word> sentenceProcess(String sentence) {
-        List<Word> tokens = new ArrayList<Word>();
+    public List<String> sentenceProcess(String sentence) {
+        List<String> tokens = new ArrayList<String>();
         int N = sentence.length();
         Map<Integer, List<Integer>> dag = createDAG(sentence);
         Map<Integer, Pair<Integer>> route = calc(sentence, dag);
@@ -184,31 +182,31 @@ public List<Word> sentenceProcess(String sentence) {
             else {
                 if (buf.length() > 0) {
                     if (buf.length() == 1) {
-                        tokens.add(wordDict.createWord(buf));
+                        tokens.add(buf.toString());
                         buf = "";
                     }
                     else {
                         if (wordDict.containsWord(buf)) {
-                            tokens.add(wordDict.getWord(buf));
+                            tokens.add(buf.toString());
                         }
                         else {
                             finalSeg.cut(buf, tokens);
                         }
                         buf = "";
                     }
                 }
-                tokens.add(wordDict.createWord(lWord));
+                tokens.add(lWord);
             }
             x = y;
         }
         if (buf.length() > 0) {
             if (buf.length() == 1) {
-                tokens.add(wordDict.createWord(buf));
+                tokens.add(buf.toString());
                 buf = "";
             }
             else {
                 if (wordDict.containsWord(buf)) {
-                    tokens.add(wordDict.getWord(buf));
+                    tokens.add(buf.toString());
                 }
                 else {
                     finalSeg.cut(buf, tokens);
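
For orientation, here is a minimal usage sketch of the segmenter after this change. It assumes the default constructor and the nested SegMode enum used throughout this diff; the sample sentence is arbitrary:

import java.util.List;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class SegmenterDemo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        // SEARCH mode emits the plain word sequence; INDEX mode additionally
        // emits dictionary 2-grams and 3-grams found inside longer tokens.
        List<SegToken> tokens = segmenter.process("这是一个伸手不见五指的黑夜。", SegMode.SEARCH);
        for (SegToken token : tokens)
            System.out.println(token);   // each line looks like [word, startOffset, endOffset]
    }
}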
12 changes: 4 additions & 8 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -1,27 +1,23 @@
 package com.huaban.analysis.jieba;
 
-import org.apache.commons.lang3.StringUtils;
-
 public class SegToken {
-    public Word word;
+    public String word;
 
     public int startOffset;
 
     public int endOffset;
 
 
-    public SegToken(Word word, int startOffset, int endOffset) {
+    public SegToken(String word, int startOffset, int endOffset) {
         this.word = word;
         this.startOffset = startOffset;
         this.endOffset = endOffset;
     }
 
 
     @Override
     public String toString() {
-        if (StringUtils.isBlank(this.word.getTokenType()))
-            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
-        else
-            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
+        return "[" + word + ", " + startOffset + ", " + endOffset + "]";
     }
 
 }
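
With the POS field gone, SegToken reduces to a (word, startOffset, endOffset) triple. A tiny sketch of the simplified toString output:

import com.huaban.analysis.jieba.SegToken;

public class SegTokenDemo {
    public static void main(String[] args) {
        SegToken token = new SegToken("北京", 0, 2);   // word, startOffset, endOffset
        System.out.println(token);                     // prints: [北京, 0, 2]
    }
}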
58 changes: 10 additions & 48 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -20,7 +20,7 @@ public class WordDictionary {
     private static final String MAIN_DICT = "/dict.txt";
     private static String USER_DICT_SUFFIX = ".dict";
 
-    public final Map<String, Word> freqs = new HashMap<String, Word>();
+    public final Map<String, Double> freqs = new HashMap<String, Double>();
     public final Set<String> loadedPath = new HashSet<String>();
     private Double minFreq = Double.MAX_VALUE;
     private Double total = 0.0;
@@ -77,20 +77,19 @@ public void loadDict() {
                 String line = br.readLine();
                 String[] tokens = line.split("[\t ]+");
 
-                if (tokens.length < 3)
+                if (tokens.length < 2)
                     continue;
 
                 String word = tokens[0];
-                String tokenType = tokens[2];
                 double freq = Double.valueOf(tokens[1]);
                 total += freq;
                 word = addWord(word);
-                freqs.put(word, createWord(word, freq, tokenType));
+                freqs.put(word, freq);
             }
             // normalize
-            for (Entry<String, Word> entry : freqs.entrySet()) {
-                entry.getValue().setFreq(Math.log(entry.getValue().getFreq() / total));
-                minFreq = Math.min(entry.getValue().getFreq(), minFreq);
+            for (Entry<String, Double> entry : freqs.entrySet()) {
+                entry.setValue((Math.log(entry.getValue() / total)));
+                minFreq = Math.min(entry.getValue(), minFreq);
             }
             System.out.println(String.format("main dict load finished, time elapsed %d ms",
                 System.currentTimeMillis() - s));
@@ -144,23 +143,13 @@ public void loadUserDict(File userDict, Charset charset) {
                 String line = br.readLine();
                 String[] tokens = line.split("[\t ]+");
 
-                if (tokens.length < 1)
+                if (tokens.length < 2)
                     continue;
 
                 String word = tokens[0];
+                double freq = Double.valueOf(tokens[1]);
                 word = addWord(word);
-                if (tokens.length == 1) {
-                    freqs.put(word, createWord(word, Math.log(3.0 / total)));
-                }
-                else if (tokens.length == 2) {
-                    double freq = Double.valueOf(tokens[1]);
-                    freqs.put(word, createWord(word, Math.log(freq / total)));
-                }
-                else {
-                    String tokenType = tokens[2];
-                    double freq = Double.valueOf(tokens[1]);
-                    freqs.put(word, createWord(word, Math.log(freq / total), tokenType));
-                }
+                freqs.put(word, Math.log(freq / total));
                 count++;
             }
             System.out.println(String.format("user dict %s load finished, tot words:%d, time elapsed:%dms",
@@ -191,37 +180,10 @@ public boolean containsWord(String word) {
     }
 
 
-    public Word getWord(String token) {
-        if (containsWord(token)) {
-            return freqs.get(token);
-        }
-        else {
-            return null;
-        }
-    }
-
-
     public Double getFreq(String key) {
         if (containsWord(key))
-            return freqs.get(key).getFreq();
+            return freqs.get(key);
         else
             return minFreq;
     }
-
-
-    public Word createWord(String token, Double freq, String tokenType) {
-        if (freqs.containsKey(token))
-            return freqs.get(token);
-        return new Word(token, freq, tokenType);
-    }
-
-
-    public Word createWord(String token, Double freq) {
-        return createWord(token, freq, "");
-    }
-
-
-    public Word createWord(String token) {
-        return createWord(token, 0.0, "");
-    }
 }
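
The dictionary arithmetic is unchanged in spirit: each dict line needs at least "word frequency" (a third POS column is now ignored rather than required), raw frequencies are summed into total, every stored value is replaced by log(freq / total), and minFreq, the smallest log-probability seen, is what getFreq falls back to for unknown words. A self-contained sketch of that normalization with illustrative counts:

import java.util.HashMap;
import java.util.Map;

public class FreqNormalizationSketch {
    public static void main(String[] args) {
        // Illustrative raw counts standing in for dict.txt entries.
        Map<String, Double> freqs = new HashMap<String, Double>();
        freqs.put("北京", 30000.0);
        freqs.put("大学", 50000.0);

        double total = 0.0;
        for (double f : freqs.values())
            total += f;

        // Same transformation as loadDict(): store log(freq / total).
        double minFreq = Double.MAX_VALUE;
        for (Map.Entry<String, Double> entry : freqs.entrySet()) {
            entry.setValue(Math.log(entry.getValue() / total));
            minFreq = Math.min(entry.getValue(), minFreq);
        }
        System.out.println(freqs + ", unknown-word floor = " + minFreq);
    }
}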
21 changes: 9 additions & 12 deletions src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java
@@ -13,8 +13,6 @@
 
 import com.huaban.analysis.jieba.CharacterUtil;
 import com.huaban.analysis.jieba.Pair;
-import com.huaban.analysis.jieba.Word;
-import com.huaban.analysis.jieba.WordDictionary;
 
 
 public class FinalSeg {
@@ -26,7 +24,6 @@ public class FinalSeg {
     private static Map<Character, Map<Character, Double>> trans;
     private static Map<Character, char[]> prevStatus;
     private static Double MIN_FLOAT = -3.14e100;;
-    private WordDictionary wordDict = WordDictionary.getInstance();
 
 
     private FinalSeg() {
@@ -108,7 +105,7 @@ private void loadModel() {
     }
 
 
-    public void cut(String sentence, List<Word> tokens) {
+    public void cut(String sentence, List<String> tokens) {
         StringBuilder chinese = new StringBuilder();
         StringBuilder other = new StringBuilder();
         for (int i = 0; i < sentence.length(); ++i) {
@@ -137,7 +134,7 @@ public void cut(String sentence, List<Word> tokens) {
     }
 
 
-    public void viterbi(String sentence, List<Word> tokens) {
+    public void viterbi(String sentence, List<String> tokens) {
         Vector<Map<Character, Double>> v = new Vector<Map<Character, Double>>();
         Map<Character, Vector<Character>> path = new HashMap<Character, Vector<Character>>();
 
@@ -194,30 +191,30 @@ else if (candidate.freq <= tranp) {
             if (pos == 'B')
                 begin = i;
             else if (pos == 'E') {
-                tokens.add(wordDict.createWord(sentence.substring(begin, i + 1)));
+                tokens.add(sentence.substring(begin, i + 1));
                 next = i + 1;
             }
             else if (pos == 'S') {
-                tokens.add(wordDict.createWord(sentence.substring(i, i + 1)));
+                tokens.add(sentence.substring(i, i + 1));
                 next = i + 1;
             }
         }
         if (next < sentence.length())
-            tokens.add(wordDict.createWord(sentence.substring(next)));
+            tokens.add(sentence.substring(next));
     }
 
 
-    private void processOtherUnknownWords(String other, List<Word> tokens) {
+    private void processOtherUnknownWords(String other, List<String> tokens) {
         Matcher mat = CharacterUtil.reSkip.matcher(other);
         int offset = 0;
         while (mat.find()) {
             if (mat.start() > offset) {
-                tokens.add(wordDict.createWord(other.substring(offset, mat.start())));
+                tokens.add(other.substring(offset, mat.start()));
             }
-            tokens.add(wordDict.createWord(mat.group()));
+            tokens.add(mat.group());
             offset = mat.end();
         }
         if (offset < other.length())
-            tokens.add(wordDict.createWord(other.substring(offset)));
+            tokens.add(other.substring(offset));
     }
 }
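
FinalSeg's Viterbi decoder tags each character with one of the HMM states B/M/E/S (begin, middle, end, single-character word) and cuts the sentence at E and S positions; with Word removed it now appends plain substrings, as the hunk above shows. A minimal sketch of that final cutting step, assuming the state sequence has already been decoded:

import java.util.ArrayList;
import java.util.List;

public class BmesCutSketch {
    public static void main(String[] args) {
        String sentence = "今天天气好";
        char[] pos = { 'B', 'E', 'B', 'E', 'S' };   // illustrative decoded states

        List<String> tokens = new ArrayList<String>();
        int begin = 0, next = 0;
        for (int i = 0; i < sentence.length(); ++i) {
            if (pos[i] == 'B')
                begin = i;                              // a word starts here
            else if (pos[i] == 'E') {                   // close the word opened at 'B'
                tokens.add(sentence.substring(begin, i + 1));
                next = i + 1;
            }
            else if (pos[i] == 'S') {                   // single-character word
                tokens.add(sentence.substring(i, i + 1));
                next = i + 1;
            }
        }
        if (next < sentence.length())                   // flush any trailing remainder
            tokens.add(sentence.substring(next));
        System.out.println(tokens);                     // [今天, 天气, 好]
    }
}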
@@ -163,14 +163,14 @@ public void testSegmentSpeed() {
         long length = 0L;
         long wordCount = 0L;
         long start = System.currentTimeMillis();
-        for (int i = 0; i < 20000; ++i)
+        for (int i = 0; i < 2000; ++i)
             for (String sentence : sentences) {
                 segmenter.process(sentence, SegMode.INDEX);
                 length += sentence.getBytes().length;
                 wordCount += sentence.length();
             }
         long elapsed = (System.currentTimeMillis() - start);
-        System.out.println(String.format("time elapsed:%d, rate:%fkb/s, words:%.2f/s", elapsed,
+        System.out.println(String.format("time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed,
             (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0)));
     }
 }
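
Two notes on the benchmark: the iteration count drops from 20000 to 2000, and the second printed figure is relabeled, although wordCount still accumulates sentence.length(), i.e. characters rather than sentences. The rate formula is bytes / 1024 divided by seconds; a worked sketch with made-up numbers:

public class RateFormulaSketch {
    public static void main(String[] args) {
        long length = 2048000L;   // bytes processed (illustrative)
        long elapsed = 4000L;     // milliseconds (illustrative)
        // Same expression as the test: (bytes / 1024) / (millis / 1000).
        double rateKbPerSec = (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f);
        System.out.println(rateKbPerSec + " kb/s");   // 500.0 kb/s
    }
}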
