Skip to content

Commit

Permalink
merge develop
Browse files Browse the repository at this point in the history
  • Loading branch information
piaolingxue committed Mar 26, 2014
2 parents 8ef8c23 + e6487e1 commit f56461b
Show file tree
Hide file tree
Showing 5 changed files with 247 additions and 149 deletions.
4 changes: 3 additions & 1 deletion conf/user.dict
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@
显瘦 3
又拍云 3
iphone 3
鲜芋仙 3
鲜芋仙 3
UTF-8 3 nz
utf-8 3 nz
50 changes: 50 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,24 @@
<target>1.6</target>
</configuration>
</plugin>
<!--
  Signs the build artifacts with GPG: the "sign" goal is bound to the
  "verify" phase and uses the key identified by <keyname> below.
  NOTE(review): the release-sign-artifacts profile in this POM configures
  the same execution id for the same plugin; confirm that declaring it
  here as well (i.e. not gated by the profile) is intended.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.4</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
<configuration>
<keyname>Libin &lt;bin.li@upai.com&gt;</keyname>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
Expand Down Expand Up @@ -99,6 +117,38 @@
</plugins>
</build>

<!--
  Build profiles.

  release-sign-artifacts: activated when the "performRelease" property has
  the value "true" (for example by passing -DperformRelease=true). When
  active it binds the maven-gpg-plugin "sign" goal to the "verify" phase so
  that the artifacts are signed with the key named in <keyname>.
  NOTE(review): no <version> is given for the plugin inside this profile;
  presumably it is resolved from another declaration of the same plugin -
  confirm.
-->
<profiles>
<profile>
<id>release-sign-artifacts</id>
<activation>
<property>
<name>performRelease</name>
<value>true</value>
</property>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
<configuration>
<keyname>Libin &lt;bin.li@upai.com&gt;</keyname>
</configuration>
</plugin>
</plugins>
</build>
</profile>
</profiles>

<distributionManagement>
<repository>
<id>sonatype-nexus-staging</id>
Expand Down
62 changes: 40 additions & 22 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,72 @@

import java.util.regex.Pattern;


/**
 * Static helpers for classifying single characters (Chinese ideograph, ASCII
 * letter, ASCII digit, connector symbol) and for normalizing a character
 * before it is matched against the dictionary.
 */
public class CharacterUtil {
    /**
     * Matches either a decimal number with a fractional part (e.g. {@code 3.14})
     * or a run of ASCII letters and digits.
     */
    public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");

    /** Symbols that are accepted by {@link #isConnector(char)}. */
    private static final char[] connectors = new char[] { '+', '#', '&', '.', '_', '-' };


    /**
     * Tells whether {@code ch} lies in the range U+4E00..U+9FA5 (inclusive).
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is inside that range
     */
    public static boolean isChineseLetter(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }


    /**
     * Tells whether {@code ch} is an ASCII letter, {@code A-Z} or {@code a-z}.
     *
     * @param ch the character to test
     * @return {@code true} for an ASCII upper- or lower-case letter
     */
    public static boolean isEnglishLetter(char ch) {
        return (ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A);
    }


    /**
     * Tells whether {@code ch} is an ASCII digit, {@code 0-9}.
     *
     * @param ch the character to test
     * @return {@code true} for an ASCII digit
     */
    public static boolean isDigit(char ch) {
        return ch >= 0x0030 && ch <= 0x0039;
    }


    /**
     * Tells whether {@code ch} is one of the connector symbols
     * {@code + # & . _ -}.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is a connector symbol
     */
    public static boolean isConnector(char ch) {
        for (char connector : connectors) {
            if (ch == connector) {
                return true;
            }
        }
        return false;
    }


    /**
     * Tells whether {@code ch} belongs to any of the recognized classes:
     * Chinese letter, English letter, digit or connector.
     *
     * @param ch the character to test
     * @return {@code true} if any of the four classification checks accepts {@code ch}
     */
    public static boolean ccFind(char ch) {
        return isChineseLetter(ch) || isEnglishLetter(ch) || isDigit(ch) || isConnector(ch);
    }


    /**
     * Normalizes a character: full-width forms become half-width, and ASCII
     * upper-case letters become lower-case. Exactly one rule is applied per
     * call, so a full-width upper-case letter is returned as its half-width
     * upper-case counterpart.
     *
     * @param input the character to normalize
     * @return the normalized character, or {@code input} itself when no rule applies
     */
    public static char regularize(char input) {
        // 12288 == U+3000, the ideographic (full-width) space.
        if (input == 12288) {
            return ' ';
        }
        // 65281..65374 == U+FF01..U+FF5E, the full-width ASCII variants;
        // subtracting 65248 (0xFEE0) yields the matching ASCII code point.
        if (input > 65280 && input < 65375) {
            return (char) (input - 65248);
        }
        // ASCII upper-case to lower-case.
        if (input >= 'A' && input <= 'Z') {
            return (char) (input + 32);
        }
        return input;
    }
}
83 changes: 52 additions & 31 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,30 @@


public class WordDictionary {
/** The single shared instance, created eagerly when the class is initialized. */
private static WordDictionary singleInstance = new WordDictionary();
private static final String MAIN_DICT = "/dict.txt";
private static String USER_DICT_SUFFIX = ".dict";
// Declared ahead of the static block below: static initializers run in
// textual order, so an initializer placed after the block would execute
// after loadDict() and overwrite whatever value had been stored by then.
private static boolean isLoaded = false;

static {
    // Load the main dictionary once, as part of class initialization.
    singleInstance.loadDict();
}

public final TrieNode trie = new TrieNode();
public final Map<String, Word> freqs = new HashMap<String, Word>();
private Double minFreq = Double.MAX_VALUE;
private Double total = 0.0;


/** Not instantiable from outside; use {@link #getInstance()}. */
private WordDictionary() {
}


/**
 * Returns the shared dictionary instance. The instance is created and its
 * main dictionary loaded during class initialization, so this accessor
 * performs no work of its own.
 *
 * @return the shared {@code WordDictionary}
 */
public static WordDictionary getInstance() {
    return singleInstance;
}


/**
* for ES to initialize the user dictionary.
*
Expand All @@ -50,17 +54,19 @@ public synchronized void init(File configFile) {
}
}


public void loadDict() {
InputStream is = this.getClass().getResourceAsStream(MAIN_DICT);
try {
BufferedReader br =
new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
BufferedReader br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));

long s = System.currentTimeMillis();
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
if (tokens.length < 3) continue;

if (tokens.length < 3)
continue;

String word = tokens[0];
String tokenType = tokens[2];
Expand All @@ -75,27 +81,31 @@ public void loadDict() {
minFreq = Math.min(entry.getValue().getFreq(), minFreq);
}
System.out.println(String.format("main dict load finished, time elapsed %d ms",
System.currentTimeMillis() - s));
} catch (IOException e) {
System.currentTimeMillis() - s));
}
catch (IOException e) {
System.err.println(String.format("%s load failure!", MAIN_DICT));
} finally {
}
finally {
try {
if (null != is) is.close();
} catch (IOException e) {
if (null != is)
is.close();
}
catch (IOException e) {
System.err.println(String.format("%s close failure!", MAIN_DICT));
}
}
}



private String addWord(String word) {
TrieNode p = this.trie;
StringBuilder r = new StringBuilder();
for (char ch : word.toCharArray()) {
ch = CharacterUtil.regularize(ch);
r.append(ch);
if (ch == ' ') continue;
if (ch == ' ')
continue;
TrieNode pChild = null;
if ((pChild = p.childs.get(ch)) == null) {
pChild = new TrieNode();
Expand All @@ -107,11 +117,13 @@ private String addWord(String word) {
return r.toString();
}


public void loadUserDict(File userDict) {
InputStream is;
try {
is = new FileInputStream(userDict);
} catch (FileNotFoundException e) {
}
catch (FileNotFoundException e) {
System.err.println(String.format("could not find %s", userDict.getAbsolutePath()));
return;
}
Expand All @@ -122,7 +134,9 @@ public void loadUserDict(File userDict) {
while (br.ready()) {
String line = br.readLine();
String[] tokens = line.split("[\t ]+");
if (tokens.length < 3) continue;

if (tokens.length < 3)
continue;

String word = tokens[0];
String tokenType = tokens[2];
Expand All @@ -131,37 +145,44 @@ public void loadUserDict(File userDict) {
freqs.put(word, Word.createWord(word, Math.log(freq / total), tokenType));
count++;
}
System.out.println(String.format(
"user dict %s load finished, tot words:%d, time elapsed:%dms",
userDict.getAbsolutePath(), count, System.currentTimeMillis() - s));
} catch (IOException e) {
System.err.println(String.format("%s: load user dict failure!",
userDict.getAbsolutePath()));
} finally {
System.out.println(String.format("user dict %s load finished, tot words:%d, time elapsed:%dms",
userDict.getAbsolutePath(), count, System.currentTimeMillis() - s));
}
catch (IOException e) {
System.err.println(String.format("%s: load user dict failure!", userDict.getAbsolutePath()));
}
finally {
try {
if (null != is) is.close();
} catch (IOException e) {
if (null != is)
is.close();
}
catch (IOException e) {
System.err.println(String.format("%s close failure!", userDict.getAbsolutePath()));
}
}
}


/**
 * Returns the trie held by this dictionary.
 *
 * @return the {@code trie} field of this instance
 */
public TrieNode getTrie() {
    return trie;
}


/**
 * Tells whether the frequency table has an entry for {@code word}.
 *
 * @param word the key to look up in {@code freqs}
 * @return {@code true} if {@code freqs} contains {@code word} as a key
 */
public boolean containsWord(String word) {
    final boolean known = this.freqs.containsKey(word);
    return known;
}

public Word getWord(String token){
if(containsWord(token)){

public Word getWord(String token) {
if (containsWord(token)) {
return freqs.get(token);
} else {
}
else {
return null;
}
}


public Double getFreq(String key) {
if (containsWord(key))
return freqs.get(key).getFreq();
Expand Down
Loading

0 comments on commit f56461b

Please sign in to comment.