Update and optimize core file.

surmon-china · surmon-china · commit 066f99ba5539 · 2017-04-24T00:17:39.000+08:00
diff --git a/README.md b/README.md
@@ -203,6 +203,7 @@ Returns a classifier instance from the JSON representation. Use this with the JS
 - [nodejieba](https://github.com/yanyiwu/nodejieba)
 - [node-segment](https://github.com/leizongmin/node-segment)
 - [china-address - 地址分词](https://github.com/booxood/china-address)
+- [word-picker](https://github.com/redhu/word-picker)
 
 ### 英文分词库：
 - [tokenize-text](https://github.com/GitbookIO/tokenize-text)
diff --git a/dist/naive-bayes.js b/dist/naive-bayes.js
@@ -8,7 +8,7 @@ function _classCallCheck(instance, Constructor) { if (!(instance instanceof Cons
 
 // 用于重置分类器的键
 // keys we use to serialize a classifier's state
-var STATE_KEYS = module.exports.STATE_KEYS = ['categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', 'wordCount', 'wordFrequencyCount', 'options'];
+var STATE_KEYS = module.exports.STATE_KEYS = ['categories', 'docCount', 'totalDocuments', 'vocabulary', 'wordCount', 'wordFrequencyCount', 'options'];
 
 /**
  * 默认分词器，英文按照空格分割单词，中文按照字符分割
@@ -56,10 +56,8 @@ var NaiveBayes = function () {
     // 分词器
     this.tokenizer = this.options.tokenizer || defaultTokenizer;
 
-    // 初始化词汇量和其大小
-    // Initialize our vocabulary and its size.
-    this.vocabulary = {};
-    this.vocabularySize = 0;
+    // 词汇表
+    this.vocabulary = [];
 
     // 已学习的文档总数量
     // number of documents we have learned from
@@ -79,7 +77,7 @@ var NaiveBayes = function () {
 
     // 所有分类
     // hashmap of our category names
-    this.categories = {};
+    this.categories = [];
   }
 
   /**
@@ -93,11 +91,11 @@ var NaiveBayes = function () {
   _createClass(NaiveBayes, [{
     key: 'initializeCategory',
     value: function initializeCategory(categoryName) {
-      if (!this.categories[categoryName]) {
+      if (!this.categories.includes(categoryName)) {
         this.docCount[categoryName] = 0;
         this.wordCount[categoryName] = 0;
         this.wordFrequencyCount[categoryName] = {};
-        this.categories[categoryName] = true;
+        this.categories.push(categoryName);
       }
       return this;
     }
@@ -142,11 +140,10 @@ var NaiveBayes = function () {
        */
       Object.keys(frequencyTable).forEach(function (token) {
 
-        // 如果不是已经存在的话，把这个词添加到我们的词汇表中
+        // 将目标词汇添加到词汇表
         // add this word to our vocabulary if not already existing
-        if (!_this.vocabulary[token]) {
-          _this.vocabulary[token] = true;
-          _this.vocabularySize++;
+        if (!_this.vocabulary.includes(token)) {
+          _this.vocabulary.push(token);
         }
 
         var frequencyInText = frequencyTable[token];
@@ -188,7 +185,7 @@ var NaiveBayes = function () {
       var frequencyTable = this.frequencyTable(tokens);
 
       // P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 的最大值 = 遍历分类，找到一个最大概率
-      Object.keys(this.categories).forEach(function (category) {
+      this.categories.forEach(function (category) {
 
         // P(C)
         var categoryProbability = _this2.docCount[category] / _this2.totalDocuments;
@@ -237,7 +234,7 @@ var NaiveBayes = function () {
       var wordCount = this.wordCount[category];
 
       // 拉普拉斯方程
-      return (wordFrequencyCount + 1) / (wordCount + this.vocabularySize);
+      return (wordFrequencyCount + 1) / (wordCount + this.vocabulary.length);
     }
 
     /**
diff --git a/lib/naive-bayes.js b/lib/naive-bayes.js
@@ -2,7 +2,7 @@
 // 用于重置分类器的键
 // keys we use to serialize a classifier's state
 const STATE_KEYS = module.exports.STATE_KEYS = [
-  'categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', 'wordCount', 'wordFrequencyCount', 'options'
+  'categories', 'docCount', 'totalDocuments', 'vocabulary', 'wordCount', 'wordFrequencyCount', 'options'
 ]
 
 /**
@@ -50,10 +50,8 @@ class NaiveBayes {
     // 分词器
     this.tokenizer = this.options.tokenizer || defaultTokenizer
 
-    // 初始化词汇量和其大小
-    // Initialize our vocabulary and its size.
-    this.vocabulary = {}
-    this.vocabularySize = 0
+    // 词汇表
+    this.vocabulary = []
 
     // 已学习的文档总数量
     // number of documents we have learned from
@@ -73,7 +71,7 @@ class NaiveBayes {
 
     // 所有分类
     // hashmap of our category names
-    this.categories = {}
+    this.categories = []
   }
 
 
@@ -84,11 +82,11 @@ class NaiveBayes {
    * @param  {String} categoryName
    */
   initializeCategory(categoryName) {
-    if (!this.categories[categoryName]) {
+    if (!this.categories.includes(categoryName)) {
       this.docCount[categoryName] = 0
       this.wordCount[categoryName] = 0
       this.wordFrequencyCount[categoryName] = {}
-      this.categories[categoryName] = true
+      this.categories.push(categoryName)
     }
     return this
   }
@@ -129,11 +127,10 @@ class NaiveBayes {
      */
     Object.keys(frequencyTable).forEach(token => {
 
-      // 如果不是已经存在的话，把这个词添加到我们的词汇表中
+      // 将目标词汇添加到词汇表
       // add this word to our vocabulary if not already existing
-      if (!this.vocabulary[token]) {
-        this.vocabulary[token] = true
-        this.vocabularySize++
+      if (!this.vocabulary.includes(token)) {
+        this.vocabulary.push(token)
       }
 
       const frequencyInText = frequencyTable[token]
@@ -171,7 +168,7 @@ class NaiveBayes {
     var frequencyTable = this.frequencyTable(tokens)
 
     // P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 的最大值 = 遍历分类，找到一个最大概率
-    Object.keys(this.categories).forEach(category => {
+    this.categories.forEach(category => {
 
       // P(C)
       const categoryProbability = this.docCount[category] / this.totalDocuments
@@ -217,7 +214,7 @@ class NaiveBayes {
     const wordCount = this.wordCount[category]
 
     // 拉普拉斯方程
-    return ( wordFrequencyCount + 1 ) / ( wordCount + this.vocabularySize )
+    return ( wordFrequencyCount + 1 ) / ( wordCount + this.vocabulary.length )
   }
 
   /**
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "naivebayes",
-  "version": "0.3.0",
+  "version": "0.4.0",
   "description": "Naive Bayes Classifier for node.js.",
   "main": "dist/naive-bayes.js",
   "scripts": {
diff --git a/test/base.js b/test/base.js
@@ -96,4 +96,4 @@ console.log('预期：正常，实际：', classifier.categorize('还没使用
 
 // 保存学习进度
 const classifierJson = classifier.toJson()
-fs.writeFileSync('./examples/classifierJson.json', JSON.stringify(classifierJson))
+fs.writeFileSync('./test/classifierJson.json', JSON.stringify(classifierJson))
diff --git a/test/classifierJson.json b/test/classifierJson.json
diff --git a/test/incremental.js b/test/incremental.js
@@ -62,4 +62,4 @@ console.log('预期：正常，实际：', classifier.categorize('马克思主
 
 // 保存学习进度
 classifierJson = classifier.toJson()
-fs.writeFileSync('./examples/classifierJson.json', JSON.stringify(classifierJson))
+fs.writeFileSync('./test/classifierJson.json', JSON.stringify(classifierJson))

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "naivebayes",` |
| `3` | | `- "version": "0.3.0",` |
| | `3` | `+ "version": "0.4.0",` |
| `4` | `4` | `"description": "Naive Bayes Classifier for node.js.",` |
| `5` | `5` | `"main": "dist/naive-bayes.js",` |
| `6` | `6` | `"scripts": {` |