Add feature_log_prob getter for BernoulliNB

morenol · morenol · commit 64548dcbebb4 · 2021-01-22T19:50:24.000-04:00
diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs
@@ -33,6 +33,8 @@
 //! ## References:
 //!
 //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
+use std::ops::Not;
+
 use crate::api::{Predictor, SupervisedEstimator};
 use crate::error::Failed;
 use crate::linalg::row_iter;
@@ -47,12 +49,26 @@ use serde::{Deserialize, Serialize};
 
 /// Naive Bayes classifier for Bearnoulli features
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
-#[derive(Debug, PartialEq)]
+#[derive(Debug)]
 struct BernoulliNBDistribution<T: RealNumber> {
     /// class labels known to the classifier
     class_labels: Vec<T>,
     class_priors: Vec<T>,
-    feature_prob: Vec<Vec<T>>,
+    feature_log_prob: Vec<Vec<T>>,
+}
+
+impl<T: RealNumber> PartialEq for BernoulliNBDistribution<T> {
+    fn eq(&self, other: &Self) -> bool {
+        if self.class_labels == other.class_labels && self.class_priors == other.class_priors {
+            self.feature_log_prob
+                .iter()
+                .zip(other.feature_log_prob.iter())
+                .any(|(left, right)| !left.approximate_eq(right, T::epsilon()))
+                .not()
+        } else {
+            false
+        }
+    }
 }
 
 impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for BernoulliNBDistribution<T> {
@@ -65,9 +81,9 @@ impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for BernoulliNBDistributi
         for feature in 0..j.len() {
             let value = j.get(feature);
             if value == T::one() {
-                likelihood += self.feature_prob[class_index][feature].ln();
+                likelihood += self.feature_log_prob[class_index][feature];
             } else {
-                likelihood += (T::one() - self.feature_prob[class_index][feature]).ln();
+                likelihood += (T::one() - self.feature_log_prob[class_index][feature].exp()).ln();
             }
         }
         likelihood
@@ -185,21 +201,23 @@ impl<T: RealNumber> BernoulliNBDistribution<T> {
             }
         }
 
-        let feature_prob = feature_in_class_counter
+        let feature_log_prob = feature_in_class_counter
             .iter()
             .enumerate()
             .map(|(class_index, feature_count)| {
                 feature_count
                     .iter()
-                    .map(|&count| (count + alpha) / (class_count[class_index] + alpha * T::two()))
+                    .map(|&count| {
+                        ((count + alpha) / (class_count[class_index] + alpha * T::two())).ln()
+                    })
                     .collect()
             })
             .collect();
 
         Ok(Self {
             class_labels,
             class_priors,
-            feature_prob,
+            feature_log_prob,
         })
     }
 }
@@ -272,6 +290,12 @@ impl<T: RealNumber, M: Matrix<T>> BernoulliNB<T, M> {
     pub fn classes(&self) -> &Vec<T> {
         &self.inner.distribution.class_labels
     }
+
+    /// Empirical log probability of each feature being 1 given a class, log P(x_i = 1|y).
+    /// Returns a 2D vector of shape (n_classes, n_features).
+    pub fn feature_log_prob(&self) -> &Vec<Vec<T>> {
+        &self.inner.distribution.feature_log_prob
+    }
 }
 
 #[cfg(test)]
@@ -302,10 +326,24 @@ mod tests {
 
         assert_eq!(bnb.inner.distribution.class_priors, &[0.75, 0.25]);
         assert_eq!(
-            bnb.inner.distribution.feature_prob,
+            bnb.feature_log_prob(),
             &[
-                &[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
-                &[1. / 3.0, 2. / 3.0, 2. / 3.0, 1. / 3.0, 1. / 3.0, 2. / 3.0]
+                &[
+                    -0.916290731874155,
+                    -0.2231435513142097,
+                    -1.6094379124341003,
+                    -0.916290731874155,
+                    -0.916290731874155,
+                    -1.6094379124341003
+                ],
+                &[
+                    -1.0986122886681098,
+                    -0.40546510810816444,
+                    -0.40546510810816444,
+                    -1.0986122886681098,
+                    -1.0986122886681098,
+                    -0.40546510810816444
+                ]
             ]
         );
 
@@ -348,10 +386,22 @@ mod tests {
             .distribution
             .class_priors
             .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2));
-        assert!(bnb.inner.distribution.feature_prob[1].approximate_eq(
-            &vec!(0.8, 0.8, 0.8, 0.4, 0.8, 0.6, 0.8, 0.6, 0.6, 0.8),
+        assert!(bnb.feature_log_prob()[1].approximate_eq(
+            &vec![
+                -0.22314355,
+                -0.22314355,
+                -0.22314355,
+                -0.91629073,
+                -0.22314355,
+                -0.51082562,
+                -0.22314355,
+                -0.51082562,
+                -0.51082562,
+                -0.22314355
+            ],
             1e-1
         ));
+        println!("{:?}", y_hat);
         assert!(y_hat.approximate_eq(
             &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
             1e-5