Commit be099cb
Due to performance problems, part-of-speech tagging is removed for now; hopefully a better pull request can provide this feature.
piaolingxue committed Aug 13, 2014
1 parent 6954ee6 commit be099cb
Showing 5 changed files with 50 additions and 97 deletions.
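
For callers, the visible API change is that SegToken.word is now a plain String rather than a Word object, so the token text is read directly from the field and part-of-speech information is no longer exposed. A hedged before/after sketch (the Word accessors are those visible in the removed lines below):

    // Before this commit (Word-based API, as removed in this diff):
    //     String text = token.word.getToken();
    //     String pos  = token.word.getTokenType();   // POS tag
    // After this commit (String-based API):
    //     String text = token.word;                  // no POS tag available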
52 changes: 25 additions & 27 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -92,27 +92,27 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                 if (sb.length() > 0) {
                     // process
                     if (mode == SegMode.SEARCH) {
-                        for (Word word : sentenceProcess(sb.toString())) {
+                        for (String word : sentenceProcess(sb.toString())) {
                             tokens.add(new SegToken(word, offset, offset += word.length()));
                         }
                     }
                     else {
-                        for (Word token : sentenceProcess(sb.toString())) {
+                        for (String token : sentenceProcess(sb.toString())) {
                             if (token.length() > 2) {
-                                Word gram2;
+                                String gram2;
                                 int j = 0;
                                 for (; j < token.length() - 1; ++j) {
-                                    gram2 = token.subSequence(j, j + 2);
-                                    if (wordDict.containsWord(gram2.getToken()))
+                                    gram2 = token.substring(j, j + 2);
+                                    if (wordDict.containsWord(gram2))
                                         tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                                 }
                             }
                             if (token.length() > 3) {
-                                Word gram3;
+                                String gram3;
                                 int j = 0;
                                 for (; j < token.length() - 2; ++j) {
-                                    gram3 = token.subSequence(j, j + 3);
-                                    if (wordDict.containsWord(gram3.getToken()))
+                                    gram3 = token.substring(j, j + 3);
+                                    if (wordDict.containsWord(gram3))
                                         tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                                 }
                             }
@@ -123,36 +123,34 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                     offset = i;
                 }
                 if (wordDict.containsWord(paragraph.substring(i, i + 1)))
-                    tokens
-                        .add(new SegToken(wordDict.getWord(paragraph.substring(i, i + 1)), offset, ++offset));
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
                 else
-                    tokens.add(new SegToken(wordDict.createWord(paragraph.substring(i, i + 1)), offset,
-                        ++offset));
+                    tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
             }
         }
         if (sb.length() > 0)
             if (mode == SegMode.SEARCH) {
-                for (Word token : sentenceProcess(sb.toString())) {
+                for (String token : sentenceProcess(sb.toString())) {
                     tokens.add(new SegToken(token, offset, offset += token.length()));
                 }
             }
             else {
-                for (Word token : sentenceProcess(sb.toString())) {
+                for (String token : sentenceProcess(sb.toString())) {
                     if (token.length() > 2) {
-                        Word gram2;
+                        String gram2;
                         int j = 0;
                         for (; j < token.length() - 1; ++j) {
-                            gram2 = token.subSequence(j, j + 2);
-                            if (wordDict.containsWord(gram2.getToken()))
+                            gram2 = token.substring(j, j + 2);
+                            if (wordDict.containsWord(gram2))
                                 tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
                         }
                     }
                     if (token.length() > 3) {
-                        Word gram3;
+                        String gram3;
                        int j = 0;
                         for (; j < token.length() - 2; ++j) {
-                            gram3 = token.subSequence(j, j + 3);
-                            if (wordDict.containsWord(gram3.getToken()))
+                            gram3 = token.substring(j, j + 3);
+                            if (wordDict.containsWord(gram3))
                                 tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
                         }
                     }
@@ -167,8 +165,8 @@ public List<SegToken> process(String paragraph, SegMode mode) {
     /*
      *
      */
-    public List<Word> sentenceProcess(String sentence) {
-        List<Word> tokens = new ArrayList<Word>();
+    public List<String> sentenceProcess(String sentence) {
+        List<String> tokens = new ArrayList<String>();
         int N = sentence.length();
         Map<Integer, List<Integer>> dag = createDAG(sentence);
         Map<Integer, Pair<Integer>> route = calc(sentence, dag);
@@ -184,31 +182,31 @@ public List<Word> sentenceProcess(String sentence) {
             else {
                 if (buf.length() > 0) {
                     if (buf.length() == 1) {
-                        tokens.add(wordDict.createWord(buf));
+                        tokens.add(buf.toString());
                         buf = "";
                     }
                     else {
                         if (wordDict.containsWord(buf)) {
-                            tokens.add(wordDict.getWord(buf));
+                            tokens.add(buf.toString());
                         }
                         else {
                             finalSeg.cut(buf, tokens);
                         }
                         buf = "";
                     }
                 }
-                tokens.add(wordDict.createWord(lWord));
+                tokens.add(lWord);
             }
             x = y;
         }
         if (buf.length() > 0) {
             if (buf.length() == 1) {
-                tokens.add(wordDict.createWord(buf));
+                tokens.add(buf.toString());
                 buf = "";
             }
             else {
                 if (wordDict.containsWord(buf)) {
-                    tokens.add(wordDict.getWord(buf));
+                    tokens.add(buf.toString());
                 }
                 else {
                     finalSeg.cut(buf, tokens);
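
For orientation, here is a minimal usage sketch of the segmenter after this change. It assumes the default constructor and the nested SegMode enum used throughout this diff; the sample sentence is arbitrary:

import java.util.List;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class SegmenterDemo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        // SEARCH mode emits the plain word sequence; INDEX mode additionally
        // emits dictionary 2-grams and 3-grams found inside longer tokens.
        List<SegToken> tokens = segmenter.process("这是一个伸手不见五指的黑夜。", SegMode.SEARCH);
        for (SegToken token : tokens)
            System.out.println(token);   // each line looks like [word, startOffset, endOffset]
    }
}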
12 changes: 4 additions & 8 deletions src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -1,27 +1,23 @@
 package com.huaban.analysis.jieba;
 
-import org.apache.commons.lang3.StringUtils;
-
 public class SegToken {
-    public Word word;
+    public String word;
 
     public int startOffset;
 
     public int endOffset;
 
 
-    public SegToken(Word word, int startOffset, int endOffset) {
+    public SegToken(String word, int startOffset, int endOffset) {
         this.word = word;
         this.startOffset = startOffset;
         this.endOffset = endOffset;
     }
 
 
     @Override
     public String toString() {
-        if (StringUtils.isBlank(this.word.getTokenType()))
-            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + "]";
-        else
-            return "[" + this.word.getToken() + ", " + startOffset + ", " + endOffset + ", " + this.word.getTokenType() + "]";
+        return "[" + word + ", " + startOffset + ", " + endOffset + "]";
     }
 
 }
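
With the POS field gone, SegToken reduces to a (word, startOffset, endOffset) triple. A tiny sketch of the simplified toString output:

import com.huaban.analysis.jieba.SegToken;

public class SegTokenDemo {
    public static void main(String[] args) {
        SegToken token = new SegToken("北京", 0, 2);   // word, startOffset, endOffset
        System.out.println(token);                     // prints: [北京, 0, 2]
    }
}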
58 changes: 10 additions & 48 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -20,7 +20,7 @@ public class WordDictionary {
     private static final String MAIN_DICT = "/dict.txt";
     private static String USER_DICT_SUFFIX = ".dict";
 
-    public final Map<String, Word> freqs = new HashMap<String, Word>();
+    public final Map<String, Double> freqs = new HashMap<String, Double>();
     public final Set<String> loadedPath = new HashSet<String>();
     private Double minFreq = Double.MAX_VALUE;
     private Double total = 0.0;
@@ -77,20 +77,19 @@ public void loadDict() {
                 String line = br.readLine();
                 String[] tokens = line.split("[\t ]+");
 
-                if (tokens.length < 3)
+                if (tokens.length < 2)
                     continue;
 
                 String word = tokens[0];
-                String tokenType = tokens[2];
                 double freq = Double.valueOf(tokens[1]);
                 total += freq;
                 word = addWord(word);
-                freqs.put(word, createWord(word, freq, tokenType));
+                freqs.put(word, freq);
             }
             // normalize
-            for (Entry<String, Word> entry : freqs.entrySet()) {
-                entry.getValue().setFreq(Math.log(entry.getValue().getFreq() / total));
-                minFreq = Math.min(entry.getValue().getFreq(), minFreq);
+            for (Entry<String, Double> entry : freqs.entrySet()) {
+                entry.setValue((Math.log(entry.getValue() / total)));
+                minFreq = Math.min(entry.getValue(), minFreq);
             }
             System.out.println(String.format("main dict load finished, time elapsed %d ms",
                 System.currentTimeMillis() - s));
@@ -144,23 +143,13 @@ public void loadUserDict(File userDict, Charset charset) {
                 String line = br.readLine();
                 String[] tokens = line.split("[\t ]+");
 
-                if (tokens.length < 1)
+                if (tokens.length < 2)
                     continue;
 
                 String word = tokens[0];
+                double freq = Double.valueOf(tokens[1]);
                 word = addWord(word);
-                if (tokens.length == 1) {
-                    freqs.put(word, createWord(word, Math.log(3.0 / total)));
-                }
-                else if (tokens.length == 2) {
-                    double freq = Double.valueOf(tokens[1]);
-                    freqs.put(word, createWord(word, Math.log(freq / total)));
-                }
-                else {
-                    String tokenType = tokens[2];
-                    double freq = Double.valueOf(tokens[1]);
-                    freqs.put(word, createWord(word, Math.log(freq / total), tokenType));
-                }
+                freqs.put(word, Math.log(freq / total));
                 count++;
             }
             System.out.println(String.format("user dict %s load finished, tot words:%d, time elapsed:%dms",
@@ -191,37 +180,10 @@ public boolean containsWord(String word) {
     }
 
 
-    public Word getWord(String token) {
-        if (containsWord(token)) {
-            return freqs.get(token);
-        }
-        else {
-            return null;
-        }
-    }
-
-
     public Double getFreq(String key) {
         if (containsWord(key))
-            return freqs.get(key).getFreq();
+            return freqs.get(key);
         else
             return minFreq;
     }
-
-
-    public Word createWord(String token, Double freq, String tokenType) {
-        if (freqs.containsKey(token))
-            return freqs.get(token);
-        return new Word(token, freq, tokenType);
-    }
-
-
-    public Word createWord(String token, Double freq) {
-        return createWord(token, freq, "");
-    }
-
-
-    public Word createWord(String token) {
-        return createWord(token, 0.0, "");
-    }
 }
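
The dictionary arithmetic is unchanged in spirit: each dict line needs at least "word frequency" (a third POS column is now ignored rather than required), raw frequencies are summed into total, every stored value is replaced by log(freq / total), and minFreq, the smallest log-probability seen, is what getFreq falls back to for unknown words. A self-contained sketch of that normalization with illustrative counts:

import java.util.HashMap;
import java.util.Map;

public class FreqNormalizationSketch {
    public static void main(String[] args) {
        // Illustrative raw counts standing in for dict.txt entries.
        Map<String, Double> freqs = new HashMap<String, Double>();
        freqs.put("北京", 30000.0);
        freqs.put("大学", 50000.0);

        double total = 0.0;
        for (double f : freqs.values())
            total += f;

        // Same transformation as loadDict(): store log(freq / total).
        double minFreq = Double.MAX_VALUE;
        for (Map.Entry<String, Double> entry : freqs.entrySet()) {
            entry.setValue(Math.log(entry.getValue() / total));
            minFreq = Math.min(entry.getValue(), minFreq);
        }
        System.out.println(freqs + ", unknown-word floor = " + minFreq);
    }
}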
21 changes: 9 additions & 12 deletions src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java
@@ -13,8 +13,6 @@
 
 import com.huaban.analysis.jieba.CharacterUtil;
 import com.huaban.analysis.jieba.Pair;
-import com.huaban.analysis.jieba.Word;
-import com.huaban.analysis.jieba.WordDictionary;
 
 
 public class FinalSeg {
@@ -26,7 +24,6 @@ public class FinalSeg {
     private static Map<Character, Map<Character, Double>> trans;
     private static Map<Character, char[]> prevStatus;
     private static Double MIN_FLOAT = -3.14e100;;
-    private WordDictionary wordDict = WordDictionary.getInstance();
 
 
     private FinalSeg() {
@@ -108,7 +105,7 @@ private void loadModel() {
     }
 
 
-    public void cut(String sentence, List<Word> tokens) {
+    public void cut(String sentence, List<String> tokens) {
         StringBuilder chinese = new StringBuilder();
         StringBuilder other = new StringBuilder();
         for (int i = 0; i < sentence.length(); ++i) {
@@ -137,7 +134,7 @@ public void cut(String sentence, List<Word> tokens) {
     }
 
 
-    public void viterbi(String sentence, List<Word> tokens) {
+    public void viterbi(String sentence, List<String> tokens) {
         Vector<Map<Character, Double>> v = new Vector<Map<Character, Double>>();
         Map<Character, Vector<Character>> path = new HashMap<Character, Vector<Character>>();
 
@@ -194,30 +191,30 @@ else if (candidate.freq <= tranp) {
             if (pos == 'B')
                 begin = i;
             else if (pos == 'E') {
-                tokens.add(wordDict.createWord(sentence.substring(begin, i + 1)));
+                tokens.add(sentence.substring(begin, i + 1));
                 next = i + 1;
             }
             else if (pos == 'S') {
-                tokens.add(wordDict.createWord(sentence.substring(i, i + 1)));
+                tokens.add(sentence.substring(i, i + 1));
                 next = i + 1;
             }
         }
         if (next < sentence.length())
-            tokens.add(wordDict.createWord(sentence.substring(next)));
+            tokens.add(sentence.substring(next));
     }
 
 
-    private void processOtherUnknownWords(String other, List<Word> tokens) {
+    private void processOtherUnknownWords(String other, List<String> tokens) {
         Matcher mat = CharacterUtil.reSkip.matcher(other);
         int offset = 0;
         while (mat.find()) {
             if (mat.start() > offset) {
-                tokens.add(wordDict.createWord(other.substring(offset, mat.start())));
+                tokens.add(other.substring(offset, mat.start()));
             }
-            tokens.add(wordDict.createWord(mat.group()));
+            tokens.add(mat.group());
             offset = mat.end();
         }
         if (offset < other.length())
-            tokens.add(wordDict.createWord(other.substring(offset)));
+            tokens.add(other.substring(offset));
     }
 }
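
FinalSeg's Viterbi decoder tags each character with one of the HMM states B/M/E/S (begin, middle, end, single-character word) and cuts the sentence at E and S positions; with Word removed it now appends plain substrings, as the hunk above shows. A minimal sketch of that final cutting step, assuming the state sequence has already been decoded:

import java.util.ArrayList;
import java.util.List;

public class BmesCutSketch {
    public static void main(String[] args) {
        String sentence = "今天天气好";
        char[] pos = { 'B', 'E', 'B', 'E', 'S' };   // illustrative decoded states

        List<String> tokens = new ArrayList<String>();
        int begin = 0, next = 0;
        for (int i = 0; i < sentence.length(); ++i) {
            if (pos[i] == 'B')
                begin = i;                              // a word starts here
            else if (pos[i] == 'E') {                   // close the word opened at 'B'
                tokens.add(sentence.substring(begin, i + 1));
                next = i + 1;
            }
            else if (pos[i] == 'S') {                   // single-character word
                tokens.add(sentence.substring(i, i + 1));
                next = i + 1;
            }
        }
        if (next < sentence.length())                   // flush any trailing remainder
            tokens.add(sentence.substring(next));
        System.out.println(tokens);                     // [今天, 天气, 好]
    }
}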
@@ -163,14 +163,14 @@ public void testSegmentSpeed() {
         long length = 0L;
         long wordCount = 0L;
         long start = System.currentTimeMillis();
-        for (int i = 0; i < 20000; ++i)
+        for (int i = 0; i < 2000; ++i)
             for (String sentence : sentences) {
                 segmenter.process(sentence, SegMode.INDEX);
                 length += sentence.getBytes().length;
                 wordCount += sentence.length();
             }
         long elapsed = (System.currentTimeMillis() - start);
-        System.out.println(String.format("time elapsed:%d, rate:%fkb/s, words:%.2f/s", elapsed,
+        System.out.println(String.format("time elapsed:%d, rate:%fkb/s, sentences:%.2f/s", elapsed,
             (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f), wordCount * 1000.0f / (elapsed * 1.0)));
     }
 }
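
Two notes on the benchmark: the iteration count drops from 20000 to 2000, and the second printed figure is relabeled, although wordCount still accumulates sentence.length(), i.e. characters rather than sentences. The rate formula is bytes / 1024 divided by seconds; a worked sketch with made-up numbers:

public class RateFormulaSketch {
    public static void main(String[] args) {
        long length = 2048000L;   // bytes processed (illustrative)
        long elapsed = 4000L;     // milliseconds (illustrative)
        // Same expression as the test: (bytes / 1024) / (millis / 1000).
        double rateKbPerSec = (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f);
        System.out.println(rateKbPerSec + " kb/s");   // 500.0 kb/s
    }
}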
