Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 066f99b

Browse files
committed Apr 23, 2017
Update and optimize core file.
1 parent 68d9839 commit 066f99b

7 files changed

+27
-32
lines changed
 

‎README.md

+1
Original file line number | Diff line number | Diff line change
@@ -203,6 +203,7 @@ Returns a classifier instance from the JSON representation. Use this with the JS
203203
- [nodejieba](https://github.com/yanyiwu/nodejieba)
204204
- [node-segment](https://github.com/leizongmin/node-segment)
205205
- [china-address - 地址分词](https://github.com/booxood/china-address)
206+
- [word-picker](https://github.com/redhu/word-picker)
206207

207208
### 英文分词库:
208209
- [tokenize-text](https://github.com/GitbookIO/tokenize-text)

‎dist/naive-bayes.js

+11-14
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ function _classCallCheck(instance, Constructor) { if (!(instance instanceof Cons
88

99
// 用于重置分类器的键
1010
// keys we use to serialize a classifier's state
11-
var STATE_KEYS = module.exports.STATE_KEYS = ['categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', 'wordCount', 'wordFrequencyCount', 'options'];
11+
var STATE_KEYS = module.exports.STATE_KEYS = ['categories', 'docCount', 'totalDocuments', 'vocabulary', 'wordCount', 'wordFrequencyCount', 'options'];
1212

1313
/**
1414
* 默认分词器,英文按照空格分割单词,中文按照字符分割
@@ -56,10 +56,8 @@ var NaiveBayes = function () {
5656
// 分词器
5757
this.tokenizer = this.options.tokenizer || defaultTokenizer;
5858

59-
// 初始化词汇量和其大小
60-
// Initialize our vocabulary and its size.
61-
this.vocabulary = {};
62-
this.vocabularySize = 0;
59+
// 词汇表
60+
this.vocabulary = [];
6361

6462
// 已学习的文档总数量
6563
// number of documents we have learned from
@@ -79,7 +77,7 @@ var NaiveBayes = function () {
7977

8078
// 所有分类
8179
// hashmap of our category names
82-
this.categories = {};
80+
this.categories = [];
8381
}
8482

8583
/**
@@ -93,11 +91,11 @@ var NaiveBayes = function () {
9391
_createClass(NaiveBayes, [{
9492
key: 'initializeCategory',
9593
value: function initializeCategory(categoryName) {
96-
if (!this.categories[categoryName]) {
94+
if (!this.categories.includes(categoryName)) {
9795
this.docCount[categoryName] = 0;
9896
this.wordCount[categoryName] = 0;
9997
this.wordFrequencyCount[categoryName] = {};
100-
this.categories[categoryName] = true;
98+
this.categories.push(categoryName);
10199
}
102100
return this;
103101
}
@@ -142,11 +140,10 @@ var NaiveBayes = function () {
142140
*/
143141
Object.keys(frequencyTable).forEach(function (token) {
144142

145-
// 如果不是已经存在的话,把这个词添加到我们的词汇表中
143+
// 将目标词汇添加到词汇表
146144
// add this word to our vocabulary if not already existing
147-
if (!_this.vocabulary[token]) {
148-
_this.vocabulary[token] = true;
149-
_this.vocabularySize++;
145+
if (!_this.vocabulary.includes(token)) {
146+
_this.vocabulary.push(token);
150147
}
151148

152149
var frequencyInText = frequencyTable[token];
@@ -188,7 +185,7 @@ var NaiveBayes = function () {
188185
var frequencyTable = this.frequencyTable(tokens);
189186

190187
// P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 的最大值 = 遍历分类,找到一个最大概率
191-
Object.keys(this.categories).forEach(function (category) {
188+
this.categories.forEach(function (category) {
192189

193190
// P(C)
194191
var categoryProbability = _this2.docCount[category] / _this2.totalDocuments;
@@ -237,7 +234,7 @@ var NaiveBayes = function () {
237234
var wordCount = this.wordCount[category];
238235

239236
// 拉普拉斯方程
240-
return (wordFrequencyCount + 1) / (wordCount + this.vocabularySize);
237+
return (wordFrequencyCount + 1) / (wordCount + this.vocabulary.length);
241238
}
242239

243240
/**

‎lib/naive-bayes.js

+11-14
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
// 用于重置分类器的键
33
// keys we use to serialize a classifier's state
44
const STATE_KEYS = module.exports.STATE_KEYS = [
5-
'categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', 'wordCount', 'wordFrequencyCount', 'options'
5+
'categories', 'docCount', 'totalDocuments', 'vocabulary', 'wordCount', 'wordFrequencyCount', 'options'
66
]
77

88
/**
@@ -50,10 +50,8 @@ class NaiveBayes {
5050
// 分词器
5151
this.tokenizer = this.options.tokenizer || defaultTokenizer
5252

53-
// 初始化词汇量和其大小
54-
// Initialize our vocabulary and its size.
55-
this.vocabulary = {}
56-
this.vocabularySize = 0
53+
// 词汇表
54+
this.vocabulary = []
5755

5856
// 已学习的文档总数量
5957
// number of documents we have learned from
@@ -73,7 +71,7 @@ class NaiveBayes {
7371

7472
// 所有分类
7573
// hashmap of our category names
76-
this.categories = {}
74+
this.categories = []
7775
}
7876

7977

@@ -84,11 +82,11 @@ class NaiveBayes {
8482
* @param {String} categoryName
8583
*/
8684
initializeCategory(categoryName) {
87-
if (!this.categories[categoryName]) {
85+
if (!this.categories.includes(categoryName)) {
8886
this.docCount[categoryName] = 0
8987
this.wordCount[categoryName] = 0
9088
this.wordFrequencyCount[categoryName] = {}
91-
this.categories[categoryName] = true
89+
this.categories.push(categoryName)
9290
}
9391
return this
9492
}
@@ -129,11 +127,10 @@ class NaiveBayes {
129127
*/
130128
Object.keys(frequencyTable).forEach(token => {
131129

132-
// 如果不是已经存在的话,把这个词添加到我们的词汇表中
130+
// 将目标词汇添加到词汇表
133131
// add this word to our vocabulary if not already existing
134-
if (!this.vocabulary[token]) {
135-
this.vocabulary[token] = true
136-
this.vocabularySize++
132+
if (!this.vocabulary.includes(token)) {
133+
this.vocabulary.push(token)
137134
}
138135

139136
const frequencyInText = frequencyTable[token]
@@ -171,7 +168,7 @@ class NaiveBayes {
171168
var frequencyTable = this.frequencyTable(tokens)
172169

173170
// P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 的最大值 = 遍历分类,找到一个最大概率
174-
Object.keys(this.categories).forEach(category => {
171+
this.categories.forEach(category => {
175172

176173
// P(C)
177174
const categoryProbability = this.docCount[category] / this.totalDocuments
@@ -217,7 +214,7 @@ class NaiveBayes {
217214
const wordCount = this.wordCount[category]
218215

219216
// 拉普拉斯方程
220-
return ( wordFrequencyCount + 1 ) / ( wordCount + this.vocabularySize )
217+
return ( wordFrequencyCount + 1 ) / ( wordCount + this.vocabulary.length )
221218
}
222219

223220
/**

‎package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "naivebayes",
3-
"version": "0.3.0",
3+
"version": "0.4.0",
44
"description": "Naive Bayes Classifier for node.js.",
55
"main": "dist/naive-bayes.js",
66
"scripts": {

‎test/base.js

+1-1
Original file line number | Diff line number | Diff line change
@@ -96,4 +96,4 @@ console.log('预期:正常,实际:', classifier.categorize('还没使用
9696

9797
// 保存学习进度
9898
const classifierJson = classifier.toJson()
99-
fs.writeFileSync('./examples/classifierJson.json', JSON.stringify(classifierJson))
99+
fs.writeFileSync('./test/classifierJson.json', JSON.stringify(classifierJson))

‎test/classifierJson.json

+1-1
Large diffs are not rendered by default.

‎test/incremental.js

+1-1
Original file line number | Diff line number | Diff line change
@@ -62,4 +62,4 @@ console.log('预期:正常,实际:', classifier.categorize('马克思主
6262

6363
// 保存学习进度
6464
classifierJson = classifier.toJson()
65-
fs.writeFileSync('./examples/classifierJson.json', JSON.stringify(classifierJson))
65+
fs.writeFileSync('./test/classifierJson.json', JSON.stringify(classifierJson))

0 commit comments

Comments
 (0)
Please sign in to comment.