
Commit

Stylistic changes
Wozacosta committed Dec 6, 2018
1 parent 830b7b3 commit 9ad4829
Showing 1 changed file with 81 additions and 84 deletions.
165 changes: 81 additions & 84 deletions lib/classificator.js
@@ -19,8 +19,8 @@ const STATE_KEYS = (module.exports.STATE_KEYS = [
'wordFrequencyCount',
'options',
]);
const DEFAULT_ALPHA = 1
const DEFAULT_FIT_PRIOR = true
const DEFAULT_ALPHA = 1;
const DEFAULT_FIT_PRIOR = true;

/**
* Initializes a NaiveBayes instance from a JSON state representation.
@@ -29,7 +29,7 @@ const DEFAULT_FIT_PRIOR = true
* @param {String|Object} jsonStrOrObject state representation obtained by classifier.toJson()
* @return {NaiveBayes} Classifier
*/
module.exports.fromJson = jsonStrOrObject => {
module.exports.fromJson = (jsonStrOrObject) => {
let parameters;

try {
@@ -47,14 +47,14 @@ module.exports.fromJson = jsonStrOrObject => {
}
} catch (e) {
console.error(e);
throw new Error('Naivebays.fromJson expects a valid JSON string or an object.')
throw new Error('Naivebays.fromJson expects a valid JSON string or an object.');
}

// init a new classifier
let classifier = new Naivebayes(parameters.options);
const classifier = new Naivebayes(parameters.options);

// override the classifier's state
STATE_KEYS.forEach(k => {
STATE_KEYS.forEach((k) => {
if (!parameters[k]) {
throw new Error(
`Naivebayes.fromJson: JSON string is missing an expected property: [${k}].`
@@ -73,11 +73,11 @@ module.exports.fromJson = jsonStrOrObject => {
* @param {String} text
* @return {Array}
*/
const defaultTokenizer = text => {
//remove punctuation from text - remove anything that isn't a word char or a space
let rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g;
const defaultTokenizer = (text) => {
// remove punctuation from text - remove anything that isn't a word char or a space
const rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g;

let sanitized = text.replace(rgxPunctuation, ' ');
const sanitized = text.replace(rgxPunctuation, ' ');
// tokens = tokens.filter(function(token) {
// return token.length >= _that.config.minimumLength;
// });
@@ -107,27 +107,27 @@ function Naivebayes(options) {
}

this.tokenizer = this.options.tokenizer || defaultTokenizer;
this.alpha = this.options.alpha || DEFAULT_ALPHA
this.fitPrior = this.options.fitPrior === undefined ? DEFAULT_FIT_PRIOR : this.options.fitPrior
//initialize our vocabulary and its size
this.alpha = this.options.alpha || DEFAULT_ALPHA;
this.fitPrior = this.options.fitPrior === undefined ? DEFAULT_FIT_PRIOR : this.options.fitPrior;
// initialize our vocabulary and its size
this.vocabulary = {};
this.vocabularySize = 0;

//number of documents we have learned from
// number of documents we have learned from
this.totalDocuments = 0;

//document frequency table for each of our categories
//=> for each category, how often were documents mapped to it
// document frequency table for each of our categories
// => for each category, how often were documents mapped to it
this.docCount = {};

//for each category, how many words total were mapped to it
// for each category, how many words total were mapped to it
this.wordCount = {};

//word frequency table for each category
//=> for each category, how frequent was a given word mapped to it
// word frequency table for each category
// => for each category, how frequent was a given word mapped to it
this.wordFrequencyCount = {};

//hashmap of our category names
// hashmap of our category names
this.categories = {};
}
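A minimal sketch of constructing the classifier defined in this file with the options read above; the option values and the inline tokenizer are illustrative, not taken from the commit, and each one falls back to the defaults set above when omitted.

// Illustrative only: constructing the classifier with explicit options.
const demoClassifier = new Naivebayes({
  alpha: 1,        // additive (Laplace) smoothing strength
  fitPrior: true,  // estimate class priors from the training documents
  tokenizer: (text) => text.toLowerCase().split(/\s+/).filter(Boolean),
});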

@@ -153,13 +153,13 @@ Naivebayes.prototype.initializeCategory = function(categoryName) {
* @param {String} categoryName
*/
Naivebayes.prototype.removeCategory = function(categoryName) {
if (!this.categories[categoryName]){
if (!this.categories[categoryName]) {
return this;
}
//update the total number of documents we have learned from
// update the total number of documents we have learned from
this.totalDocuments -= this.docCount[categoryName];

Object.keys(this.wordFrequencyCount[categoryName]).forEach(token => {
Object.keys(this.wordFrequencyCount[categoryName]).forEach((token) => {
this.vocabulary[token]--;
if (this.vocabulary[token] === 0) this.vocabularySize--;
});
@@ -180,26 +180,25 @@ Naivebayes.prototype.removeCategory = function(categoryName) {
* @param {String} category Category to learn as being text
*/
Naivebayes.prototype.learn = function(text, category) {
//initialize category data structures if we've never seen this category
// initialize category data structures if we've never seen this category
this.initializeCategory(category);

//update our count of how many documents mapped to this category
// update our count of how many documents mapped to this category
this.docCount[category]++;

//update the total number of documents we have learned from
// update the total number of documents we have learned from
this.totalDocuments++;

//normalize the text into a word array
let tokens = this.tokenizer(text);

//get a frequency count for each token in the text
let frequencyTable = this.frequencyTable(tokens);
// normalize the text into a word array
const tokens = this.tokenizer(text);

Object.keys(frequencyTable).forEach(token => {
// get a frequency count for each token in the text
const frequencyTable = this.frequencyTable(tokens);

let frequencyInText = frequencyTable[token];
Object.keys(frequencyTable).forEach((token) => {
const frequencyInText = frequencyTable[token];

//add this word to our vocabulary if not already existing
// add this word to our vocabulary if not already existing
if (!this.vocabulary[token] || this.vocabulary[token] === 0) {
this.vocabularySize++;
this.vocabulary[token] = 1;
@@ -210,13 +209,12 @@ Naivebayes.prototype.learn = function(text, category) {
}


//update the frequency information for this word in this category
// update the frequency information for this word in this category
if (!this.wordFrequencyCount[category][token]) {
this.wordFrequencyCount[category][token] = frequencyInText;
}
else this.wordFrequencyCount[category][token] += frequencyInText;
} else this.wordFrequencyCount[category][token] += frequencyInText;

//update the count of all words we have seen mapped to this category
// update the count of all words we have seen mapped to this category
this.wordCount[category] += frequencyInText;
});

@@ -230,45 +228,44 @@ Naivebayes.prototype.learn = function(text, category) {
* @param {String} text
* @param {String} category Category to unlearn as being text
*/
Naivebayes.prototype.unlearn = function(text, category){
//update our count of how many documents mapped to this category
Naivebayes.prototype.unlearn = function(text, category) {
// update our count of how many documents mapped to this category
this.docCount[category]--;
if (this.docCount[category] === 0){
if (this.docCount[category] === 0) {
delete this.docCount[category];
}

//update the total number of documents we have learned from
// update the total number of documents we have learned from
this.totalDocuments--;

//normalize the text into a word array
let tokens = this.tokenizer(text);
// normalize the text into a word array
const tokens = this.tokenizer(text);

//get a frequency count for each token in the text
let frequencyTable = this.frequencyTable(tokens);
// get a frequency count for each token in the text
const frequencyTable = this.frequencyTable(tokens);

/*
Update our vocabulary and our word frequency count for this category
*/

Object.keys(frequencyTable).forEach(token => {

let frequencyInText = frequencyTable[token];
Object.keys(frequencyTable).forEach((token) => {
const frequencyInText = frequencyTable[token];

//add this word to our vocabulary if not already existing
// add this word to our vocabulary if not already existing
if (this.vocabulary[token] && this.vocabulary[token] > 0) {
this.vocabulary[token] -= frequencyInText;
if (this.vocabulary[token] === 0) this.vocabularySize--;
}


this.wordFrequencyCount[category][token] -= frequencyInText;
if (this.wordFrequencyCount[category][token] === 0){
if (this.wordFrequencyCount[category][token] === 0) {
delete this.wordFrequencyCount[category][token];
}

//update the count of all words we have seen mapped to this category
// update the count of all words we have seen mapped to this category
this.wordCount[category] -= frequencyInText;
if (this.wordCount[category] === 0){
if (this.wordCount[category] === 0) {
delete this.wordCount[category];
delete this.wordFrequencyCount[category];
}
@@ -285,40 +282,40 @@ Naivebayes.prototype.unlearn = function(text, category){
*
* @return {Object} The predicted category, and the likelihoods stats.
*/
Naivebayes.prototype.categorize = function(text){
Naivebayes.prototype.categorize = function(text) {
const tokens = this.tokenizer(text);
const frequencyTable = this.frequencyTable(tokens);
const categories = Object.keys(this.categories)
const categories = Object.keys(this.categories);
const likelihoods = [];

// iterate through our categories to find the one with max probability for this text
categories.forEach(category => {
//start by calculating the overall probability of this category
//=> out of all documents we've ever looked at, how many were
categories.forEach((category) => {
// start by calculating the overall probability of this category
// => out of all documents we've ever looked at, how many were
// mapped to this category
let categoryLikelihood
let categoryLikelihood;
if (this.fitPrior) {
categoryLikelihood = this.docCount[category] / this.totalDocuments;
} else {
categoryLikelihood = 1
categoryLikelihood = 1;
}

//take the log to avoid underflow
// take the log to avoid underflow
// let logLikelihood = Math.log(categoryLikelihood);
let logLikelihood = Decimal(categoryLikelihood);
logLikelihood = logLikelihood.naturalLogarithm();

//now determine P( w | c ) for each word `w` in the text
Object.keys(frequencyTable).forEach(token => {
// now determine P( w | c ) for each word `w` in the text
Object.keys(frequencyTable).forEach((token) => {
if (this.vocabulary[token] && this.vocabulary[token] > 0) {
let termFrequencyInText = frequencyTable[token];
let tokenProbability = this.tokenProbability(token, category);
const termFrequencyInText = frequencyTable[token];
const tokenProbability = this.tokenProbability(token, category);

// determine the log of the P( w | c ) for this word
// logLikelihood += termFrequencyInText * Math.log(tokenProbability);
let logTokenProbability = Decimal(tokenProbability);
logTokenProbability = logTokenProbability.naturalLogarithm();
logLikelihood = logLikelihood.plus(termFrequencyInText * logTokenProbability)
logLikelihood = logLikelihood.plus(termFrequencyInText * logTokenProbability);
}
});

@@ -328,33 +325,33 @@ Naivebayes.prototype.categorize = function(text){
likelihoods.push({ category, logLikelihood });
});

const logsumexp = likelihoods => {
const logsumexp = (likelihoods) => {
let sum = new Decimal(0);
likelihoods.forEach(likelihood => {
let x = Decimal(likelihood.logLikelihood);
let a = Decimal.exp(x);
likelihoods.forEach((likelihood) => {
const x = Decimal(likelihood.logLikelihood);
const a = Decimal.exp(x);
sum = sum.plus(a);
})
});

return sum.naturalLogarithm();
}
};

const logProbX = logsumexp(likelihoods)
likelihoods.forEach(likelihood => {
const logProbX = logsumexp(likelihoods);
likelihoods.forEach((likelihood) => {
likelihood.logProba = Decimal(likelihood.logLikelihood).minus(logProbX);
likelihood.proba = likelihood.logProba.naturalExponential();
likelihood.logProba = likelihood.logProba.toNumber();
likelihood.proba = likelihood.proba.toNumber();
likelihood.logLikelihood = likelihood.logLikelihood.toNumber();
})
});

// sort to have first element with biggest probability
likelihoods.sort((a, b) => b.proba - a.proba)
likelihoods.sort((a, b) => b.proba - a.proba);

return {
likelihoods,
predictedCategory: likelihoods[0].category
}
};
};
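The logsumexp helper above is what turns the per-category log-likelihoods into probabilities that sum to one. The same normalization, sketched with plain Math numbers instead of Decimal; the input values are hypothetical.

// log P(category | text) = logLikelihood(category) - log( sum_k exp(logLikelihood(k)) )
const logLikelihoodValues = [-12.3, -14.1]; // hypothetical, one entry per category
const logProbX = Math.log(logLikelihoodValues.reduce((sum, l) => sum + Math.exp(l), 0));
const probas = logLikelihoodValues.map((l) => Math.exp(l - logProbX));
// probas ≈ [0.858, 0.142]; the largest entry corresponds to predictedCategory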

/**
@@ -364,14 +361,14 @@ Naivebayes.prototype.categorize = function(text){
* @param {String} category
* @return {Number} probability
*/
Naivebayes.prototype.tokenProbability = function(token, category){
//how many times this word has occurred in documents mapped to this category
Naivebayes.prototype.tokenProbability = function(token, category) {
// how many times this word has occurred in documents mapped to this category
const wordFrequencyCount = this.wordFrequencyCount[category][token] || 0;

//what is the count of all words that have ever been mapped to this category
// what is the count of all words that have ever been mapped to this category
const wordCount = this.wordCount[category];

//use laplace Add-1 Smoothing equation
// use laplace Add-1 Smoothing equation
return (wordFrequencyCount + this.alpha) / (wordCount + this.alpha * this.vocabularySize);
};
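A worked instance of the add-alpha (Laplace) smoothing above, using made-up counts, to show why a word never seen in a category does not zero out that category.

// Hypothetical counts with alpha = 1 (the default set above):
const freqInCategory = 3;    // times the token was mapped to this category
const wordsInCategory = 40;  // all tokens ever mapped to this category
const vocabSize = 100;       // distinct tokens across all categories
const pSeen = (freqInCategory + 1) / (wordsInCategory + 1 * vocabSize);  // 4 / 140 ≈ 0.029
const pUnseen = (0 + 1) / (wordsInCategory + 1 * vocabSize);             // 1 / 140 ≈ 0.007, not 0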

@@ -384,9 +381,9 @@ Naivebayes.prototype.tokenProbability = function(token, category){
* @return {Object}
*/
Naivebayes.prototype.frequencyTable = function(tokens) {
let frequencyTable = Object.create(null);
const frequencyTable = Object.create(null);

tokens.forEach(token => {
tokens.forEach((token) => {
if (!frequencyTable[token]) frequencyTable[token] = 1;
else frequencyTable[token]++;
});
@@ -399,7 +396,7 @@ Naivebayes.prototype.frequencyTable = function(tokens) {
* @return {String} Representation of the classifier.
*/
Naivebayes.prototype.toJson = function() {
let state = {};
const state = {};

STATE_KEYS.forEach(k => (state[k] = this[k]));

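For context, a hedged end-to-end sketch of the API this file implements, written as if it ran inside this module so that Naivebayes and the fromJson export are in scope; the training strings are invented and the package's public export shape is not shown in this diff.

// Illustrative only; not part of the commit.
const classifier = new Naivebayes();
classifier.learn('amazing, awesome movie! Yeah!', 'positive');
classifier.learn('terrible, boring, bad film', 'negative');

const { predictedCategory, likelihoods } = classifier.categorize('awesome, cool film');
// likelihoods holds { category, proba, logProba, logLikelihood } entries sorted by proba

// Round-trip the serialized state through the fromJson loader edited above.
const restoredClassifier = module.exports.fromJson(classifier.toJson());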
