Merge pull request huaban#34 from shibing624/master
Add part-of-speech tagging; thanks @shibing624
piaolingxue authored Aug 12, 2016
2 parents a959944 + 89cf1c8 commit c098e0b
Showing 16 changed files with 769 additions and 54 deletions.
24 changes: 20 additions & 4 deletions README.org
@@ -1,23 +1,39 @@
* Modified by xuming, 2016-06-01
The Java port of jieba keeps only the original project's search-engine
segmentation features (cut_for_index, cut_for_search).
Part-of-speech tagging and keyword extraction are not implemented (they can be considered later if needed).

* jieba-analysis (jieba for Java)
First of all, thanks to [[https://github.com/fxsjy][fxsjy]], the original author of jieba; without his
selfless contribution we would never have come to know jieba, let alone have this Java version.

The original jieba is written in Python; the project currently has 170 watchers,
727 stars, and 238 forks on GitHub (see the original repository for the latest figures), so it already has a fair user base.

The Java port keeps only the original project's search-engine segmentation features (cut_for_index, cut_for_search); part-of-speech tagging and keyword extraction are not implemented (they can be considered later if needed).


* Introduction
** Supported segmentation modes
- Search mode, for segmenting user queries
- Index mode, for segmenting documents to be indexed
- Search mode (SegMode.SEARCH), for segmenting user queries
- Index mode (SegMode.INDEX), for segmenting documents to be indexed; builds on Search mode by re-splitting long words to improve recall, suited to search-engine indexing
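
A minimal usage sketch of these modes (illustrative; it assumes SegMode is the enum nested in JiebaSegmenter, and the sample sentence is arbitrary):

#+BEGIN_SRC java
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;
import java.util.List;

public class Demo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        // SEARCH mode: coarse segmentation for user queries
        List<SegToken> tokens = segmenter.process("这是一个伸手不见五指的黑夜。", SegMode.SEARCH);
        for (SegToken token : tokens)
            System.out.println(token);  // after this commit, prints word/nature
    }
}
#+END_SRC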

** The Python version supports three segmentation modes:
- Accurate mode, which tries to split the sentence as precisely as possible; suited to text analysis;
- Full mode, which scans out every sequence that can form a word; very fast, but cannot resolve ambiguity;
- Search-engine mode, which builds on accurate mode by re-splitting long words to improve recall; suited to search-engine segmentation.

** Features
- Multiple segmentation modes
- Full-width characters are uniformly converted to half-width
- User dictionary support (see the loading sketch after this list)
- The conf directory contains curated Sogou cell lexicons (搜狗细胞词库)
- For performance reasons, the latest snapshot removes part-of-speech tagging; a better pull request providing this feature would be welcome.
- For performance reasons, the latest snapshot removes part-of-speech tagging; a better pull request providing this feature would be welcome
- Traditional Chinese segmentation
- Custom dictionary support
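
A minimal sketch of loading a user dictionary (the path is illustrative; loadUserDict(Path, Charset) appears in the WordDictionary diff below):

#+BEGIN_SRC java
import com.huaban.analysis.jieba.WordDictionary;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

public class LoadDict {
    public static void main(String[] args) {
        // merge conf/user.dict into the singleton dictionary
        WordDictionary.getInstance().loadUserDict(Paths.get("conf/user.dict"), StandardCharsets.UTF_8);
    }
}
#+END_SRC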

** Algorithm
- Efficient word-graph scanning based on a prefix dictionary, generating a directed acyclic graph (DAG) of all possible word formations in the sentence
- Dynamic programming to find the maximum-probability path, i.e. the best segmentation by word frequency (see the sketch after this list)
- For out-of-vocabulary words, an HMM model based on the word-forming capability of Chinese characters, decoded with the Viterbi algorithm
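
The path search corresponds to the calc method in the JiebaSegmenter diff below; a simplified sketch of the recurrence, assuming log-scaled word frequencies (logFreq stands in for wordDict.getFreq):

#+BEGIN_SRC java
import java.util.List;
import java.util.Map;
import java.util.function.ToDoubleFunction;

class MaxProbPath {
    // route[i]: best log-score for segmenting sentence[i..n); bestEnd[i]: end of the word chosen at i
    static int[] bestPath(String sentence, Map<Integer, List<Integer>> dag, ToDoubleFunction<String> logFreq) {
        int n = sentence.length();
        double[] route = new double[n + 1];  // route[n] = 0.0 for the empty suffix
        int[] bestEnd = new int[n];
        for (int i = n - 1; i >= 0; i--) {
            route[i] = Double.NEGATIVE_INFINITY;
            for (int x : dag.get(i)) {       // candidate word sentence[i..x]
                double score = logFreq.applyAsDouble(sentence.substring(i, x + 1)) + route[x + 1];
                if (score > route[i]) { route[i] = score; bestEnd[i] = x; }
            }
        }
        return bestEnd;  // follow i -> bestEnd[i] + 1 to read off the segmentation
    }
}
#+END_SRC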

* How to obtain
- Current stable version
5 changes: 5 additions & 0 deletions conf/user.dict
@@ -6,3 +6,8 @@ iphone 3
鲜芋仙 3
UTF-8 3 nz
utf-8 3 nz
簡體字 53 n
簡體字典 53 n
矿泉水瓶盖 53 n
点赞 3 nz
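(Each entry follows the pattern =word frequency [pos-tag]=; judging by the earlier entries, the tag column is optional in the user dictionary, while the new main-dictionary parsing in WordDictionary below reads it unconditionally.)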

60 changes: 28 additions & 32 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
@@ -1,12 +1,12 @@
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.viterbi.FinalSeg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.huaban.analysis.jieba.viterbi.FinalSeg;


public class JiebaSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
@@ -32,17 +32,15 @@ private Map<Integer, List<Integer>> createDAG(String sentence) {
List<Integer> value = new ArrayList<Integer>();
dag.put(i, value);
value.add(j);
}
else
} else
dag.get(i).add(j);
}
j += 1;
if (j >= N) {
i += 1;
j = i;
}
}
else {
} else {
i += 1;
j = i;
}
@@ -68,8 +66,7 @@ private Map<Integer, Pair<Integer>> calc(String sentence, Map<Integer, List<Inte
double freq = wordDict.getFreq(sentence.substring(i, x + 1)) + route.get(x + 1).freq;
if (null == candidate) {
candidate = new Pair<Integer>(x, freq);
}
else if (candidate.freq < freq) {
} else if (candidate.freq < freq) {
candidate.freq = freq;
candidate.key = x;
}
@@ -93,18 +90,17 @@ public List<SegToken> process(String paragraph, SegMode mode) {
// process
if (mode == SegMode.SEARCH) {
for (String word : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(word, offset, offset += word.length()));
tokens.add(new SegToken(word, offset, offset += word.length(), wordDict.getNature(word)));
}
}
else {
} else {
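// INDEX mode: in addition to each segmented token, emit any dictionary
// 2-grams and 3-grams found inside longer tokens, improving recall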
for (String token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsWord(gram2))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
tokens.add(new SegToken(gram2, offset + j, offset + j + 2, wordDict.getNature(gram2)));
}
}
if (token.length() > 3) {
@@ -113,36 +109,35 @@ public List<SegToken> process(String paragraph, SegMode mode) {
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsWord(gram3))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
tokens.add(new SegToken(gram3, offset + j, offset + j + 3, wordDict.getNature(gram3)));
}
}
tokens.add(new SegToken(token, offset, offset += token.length()));
tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getNature(token)));
}
}
sb = new StringBuilder();
offset = i;
}
// note: both branches are identical; nature is looked up for the single
// character (the original commit passed the whole paragraph to getNature)
if (wordDict.containsWord(paragraph.substring(i, i + 1)))
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset, wordDict.getNature(paragraph.substring(i, i + 1))));
else
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset, wordDict.getNature(paragraph.substring(i, i + 1))));
}
}
if (sb.length() > 0)
if (mode == SegMode.SEARCH) {
for (String token : sentenceProcess(sb.toString())) {
tokens.add(new SegToken(token, offset, offset += token.length()));
tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getNature(token)));
}
}
else {
} else {
for (String token : sentenceProcess(sb.toString())) {
if (token.length() > 2) {
String gram2;
int j = 0;
for (; j < token.length() - 1; ++j) {
gram2 = token.substring(j, j + 2);
if (wordDict.containsWord(gram2))
tokens.add(new SegToken(gram2, offset + j, offset + j + 2));
tokens.add(new SegToken(gram2, offset + j, offset + j + 2, wordDict.getNature(gram2)));
}
}
if (token.length() > 3) {
@@ -151,19 +146,24 @@ public List<SegToken> process(String paragraph, SegMode mode) {
for (; j < token.length() - 2; ++j) {
gram3 = token.substring(j, j + 3);
if (wordDict.containsWord(gram3))
tokens.add(new SegToken(gram3, offset + j, offset + j + 3));
tokens.add(new SegToken(gram3, offset + j, offset + j + 3, wordDict.getNature(gram3)));
}
}
tokens.add(new SegToken(token, offset, offset += token.length()));
tokens.add(new SegToken(token, offset, offset += token.length(), wordDict.getNature(token)));
}
}

return tokens;
}


/**
 * Segment a single sentence into words.
 *
 * @param sentence the sentence to segment
 * @return the list of segmented words
 * @author xuming
 * @date 2016/6/7 11:17
 */
public List<String> sentenceProcess(String sentence) {
List<String> tokens = new ArrayList<String>();
@@ -186,12 +186,10 @@ public List<String> sentenceProcess(String sentence) {
sb = new StringBuilder();
if (buf.length() == 1) {
tokens.add(buf);
}
else {
} else {
if (wordDict.containsWord(buf)) {
tokens.add(buf);
}
else {
} else {
finalSeg.cut(buf, tokens);
}
}
@@ -204,12 +202,10 @@ public List<String> sentenceProcess(String sentence) {
if (buf.length() > 0) {
if (buf.length() == 1) {
tokens.add(buf);
}
else {
} else {
if (wordDict.containsWord(buf)) {
tokens.add(buf);
}
else {
} else {
finalSeg.cut(buf, tokens);
}
}
14 changes: 13 additions & 1 deletion src/main/java/com/huaban/analysis/jieba/SegToken.java
@@ -7,17 +7,29 @@ public class SegToken {

public int endOffset;

public String nature; // added by xuming, 2016-06-07

public SegToken(String word, int startOffset, int endOffset) {
this.word = word;
this.startOffset = startOffset;
this.endOffset = endOffset;
}

public SegToken(String word, int startOffset, int endOffset, String nature) {
this.word = word;
this.startOffset = startOffset;
this.endOffset = endOffset;
this.nature = nature;
}

// @Override
// public String toString() {
// return "[" + word + ", " + startOffset + ", " + endOffset + "]";
// }

@Override
public String toString() {
return "[" + word + ", " + startOffset + ", " + endOffset + "]";
return word + "/" + nature;
}

}
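
A quick sketch of the new rendering (the word and tag are illustrative):

    SegToken t = new SegToken("天气", 2, 4, "n");
    System.out.println(t); // prints 天气/n (the offsets are no longer shown)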
20 changes: 19 additions & 1 deletion src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -23,12 +23,17 @@ public class WordDictionary {
private static String USER_DICT_SUFFIX = ".dict";

public final Map<String, Double> freqs = new HashMap<String, Double>();
public final Map<String, String> natures = new HashMap<String, String>();
public final Set<String> loadedPath = new HashSet<String>();
private Double minFreq = Double.MAX_VALUE;
private Double total = 0.0;
private DictSegment _dict;


// Part of speech; only set after POS recognition, empty by default
private String nature = "";


private WordDictionary() {
this.loadDict();
}
@@ -101,9 +106,11 @@ public void loadDict() {

String word = tokens[0];
double freq = Double.valueOf(tokens[1]);
String nature = String.valueOf(tokens[2]);
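// note: this assumes every main-dictionary line carries a third POS column;
// a line without one would fail here with ArrayIndexOutOfBoundsException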
total += freq;
word = addWord(word);
freqs.put(word, freq);
natures.put(word, nature);
}
// normalize
for (Entry<String, Double> entry : freqs.entrySet()) {
@@ -167,7 +174,7 @@ public void loadUserDict(Path userDict, Charset charset) {
freqs.put(word, Math.log(freq / total));
count++;
}
System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));
System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, total words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s));
br.close();
}
catch (IOException e) {
@@ -185,11 +192,22 @@ public boolean containsWord(String word) {
return freqs.containsKey(word);
}

public boolean containsNature(String word) {
return natures.containsKey(word);
}


public Double getFreq(String key) {
if (containsWord(key))
return freqs.get(key);
else
return minFreq;
}

public String getNature(String key) {
if (containsNature(key))
return natures.get(key);
else
return "";
}
}
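
A minimal sketch of the new lookups (the word is illustrative):

    WordDictionary dict = WordDictionary.getInstance();
    Double freq = dict.getFreq("天气");     // log-normalized frequency, or minFreq for unknown words
    String nature = dict.getNature("天气"); // POS tag from the dictionary, or "" when absent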
