Linear decision trees improvements (#60)
* add debug trait to structs

* initial commit for random forest implementation

* WIP first iter of random forest

* add benches

* setup bench and cleanup

* cleanup

* add max_n_rows for single decision tree fitting

* implement random forest feature importance as collection of features from decision trees

* implement random forest feature importance as collection of features from decision trees

* remove unused var

* remove unused var

* run clippy

* assert test success for feature importance

* clippy and fmt

* store references of nodes to queue

* WIP voting classifier and predictor trait

* WIP voting classifier and predictor trait

* implement and test VotingClassifier hard voting

* implement predict_proba for random forest and tested

* documentation, examples, cleanup

* cleanup

* implement LinfaError for Predictor trait

* fixed tests and CI/CD pipeline

* renamed predict_classes to predict in logreg for consistency

* implement ProbabilisticPredictor whenever needed

* VotingClassifier implements Predictor trait

* PR-43 Moss comments addressed

* Switch `linfa-tree` to new infrastructure

* Experiment with interface

* Add argmax ensemble classifier

* Run fmt

* Add test with random noise

* Customize decision trees with weights

 * use weighting of dataset

* Remove unnecessary casting

* Compare weight in splits with hyperparams
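
A minimal, self-contained sketch of the idea (not the actual `linfa-trees` internals; `split_is_admissible` is a hypothetical helper, only the `min_weight_split`/`min_weight_leaf` hyperparameter names are taken from the diff below): a split is accepted only when the summed sample weights clear the configured thresholds.

```rust
// Hypothetical sketch, not the linfa-trees implementation: accept a split only
// when the summed sample weights clear the min_weight_split / min_weight_leaf
// thresholds (names taken from the builder calls in the diff below).
fn split_is_admissible(
    left_weights: &[f64],
    right_weights: &[f64],
    min_weight_split: f64,
    min_weight_leaf: f64,
) -> bool {
    let left: f64 = left_weights.iter().sum();
    let right: f64 = right_weights.iter().sum();
    // The node must carry enough total weight to be split at all, and each
    // child must keep at least `min_weight_leaf` of weight.
    left + right >= min_weight_split && left >= min_weight_leaf && right >= min_weight_leaf
}

fn main() {
    // Unweighted data corresponds to a weight of 1.0 per sample.
    let ok = split_is_admissible(&[1.0; 12], &[1.0; 3], 10.0, 5.0);
    println!("split admissible: {}", ok); // false: the right child is too light
}
```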

* Rename _samples hyperparams

* Fix cargo fmt lint?

* Shush random forest example for time being

* Added new test for perfectly separable data

* Appease clippy

* Fix error in test

 * use midpoint
 * skip equal values until new value is encountered
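
The fixed split-point selection works along these lines (illustrative sketch, not the actual crate code): sort the feature values, skip runs of equal values, and place each candidate threshold at the midpoint between distinct neighbouring values.

```rust
// Illustrative sketch, not the linfa-trees code: enumerate candidate split
// thresholds for one feature column.
fn candidate_thresholds(mut values: Vec<f64>) -> Vec<f64> {
    values.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mut thresholds = Vec::new();
    for window in values.windows(2) {
        let (prev, next) = (window[0], window[1]);
        // Skip equal values: a threshold between identical values cannot
        // separate any samples.
        if (next - prev).abs() > f64::EPSILON {
            // Use the midpoint between the two distinct neighbouring values.
            thresholds.push((prev + next) / 2.0);
        }
    }
    thresholds
}

fn main() {
    let feature = vec![1.0, 1.0, 1.0, 2.0, 4.0];
    // Distinct neighbours are (1.0, 2.0) and (2.0, 4.0) -> midpoints 1.5 and 3.0.
    assert_eq!(candidate_thresholds(feature), vec![1.5, 3.0]);
}
```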

* Run cargo fmt

* Add max_depth function for decision trees

* Add impurity decrease function

* Add mean impurity decrease
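
For reference, a self-contained sketch of what these quantities measure (illustrative code, not the crate's implementation; with a weighted dataset the counts below become summed sample weights): the impurity decrease of a split is the parent impurity minus the size-weighted impurities of the children, and the mean over all splits summarises the whole tree.

```rust
// Illustrative sketch: Gini impurity of a class-count vector and the weighted
// impurity decrease of one split,
// impurity(parent) - (n_left/n) * impurity(left) - (n_right/n) * impurity(right).
fn gini(counts: &[usize]) -> f64 {
    let total: usize = counts.iter().sum();
    if total == 0 {
        return 0.0;
    }
    let sum_sq: f64 = counts
        .iter()
        .map(|&c| {
            let p = c as f64 / total as f64;
            p * p
        })
        .sum();
    1.0 - sum_sq
}

fn impurity_decrease(parent: &[usize], left: &[usize], right: &[usize]) -> f64 {
    let n = parent.iter().sum::<usize>() as f64;
    let n_l = left.iter().sum::<usize>() as f64;
    let n_r = right.iter().sum::<usize>() as f64;
    gini(parent) - (n_l / n) * gini(left) - (n_r / n) * gini(right)
}

fn main() {
    // A perfect split of a balanced two-class node: impurity drops from 0.5 to 0.0.
    let decrease = impurity_decrease(&[10, 10], &[10, 0], &[0, 10]);
    assert!((decrease - 0.5).abs() < 1e-12);
}
```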

* Add more tests to linear decision trees

 * use toy test from sklearn
 * use four perfectly separable uniform blobs

* Remove number of classes hyper-parameter

This hyper-parameter can be estimated from the input data and is
therefore unnecessary in the API.
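
For example, the class count can be read straight off the training targets (hypothetical helper, shown only to illustrate the reasoning behind dropping the hyper-parameter):

```rust
use std::collections::HashSet;

// Hypothetical helper: the class count used to size internal arrays can be
// derived from the training targets, so callers never have to pass it.
fn n_classes(targets: &[usize]) -> usize {
    targets.iter().copied().collect::<HashSet<_>>().len()
}

fn main() {
    let targets = [0usize, 2, 1, 1, 0, 2, 2];
    assert_eq!(n_classes(&targets), 3);
}
```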

* Remove ensemble algorithm

* Address issue with toy test

* Simplify tree inspection methods

 * introduce node iterator
 * rewrite `max_depth`, `num_leaves`, `features` in iterator syntax
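
The pattern looks roughly like this (illustrative types, not the actual `linfa-trees` structs): once the tree exposes an iterator over its nodes, the inspection methods reduce to short iterator chains.

```rust
// Illustrative sketch of the pattern: an iterator that walks every node of the
// tree, so inspection methods become plain iterator chains.
struct Node {
    depth: usize,
    children: Vec<Node>,
}

struct NodeIter<'a> {
    queue: Vec<&'a Node>,
}

impl<'a> Iterator for NodeIter<'a> {
    type Item = &'a Node;

    fn next(&mut self) -> Option<&'a Node> {
        let node = self.queue.pop()?;
        // Push the children so they are visited on later calls.
        self.queue.extend(node.children.iter());
        Some(node)
    }
}

impl Node {
    fn iter(&self) -> NodeIter<'_> {
        NodeIter { queue: vec![self] }
    }

    fn max_depth(&self) -> usize {
        self.iter().map(|n| n.depth).max().unwrap_or(0)
    }

    fn num_leaves(&self) -> usize {
        self.iter().filter(|n| n.children.is_empty()).count()
    }
}

fn main() {
    let leaf = |depth| Node { depth, children: vec![] };
    let tree = Node {
        depth: 0,
        children: vec![
            Node { depth: 1, children: vec![leaf(2), leaf(2)] },
            leaf(1),
        ],
    };
    assert_eq!(tree.max_depth(), 2);
    assert_eq!(tree.num_leaves(), 3);
}
```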

* Add max depth testing and hyperparameter validation

* Fix parameter syntax in benchmarks

* Run cargo fmt

* Add tikz export builder

* Improve decision tree formatting

* Add pruning

* Adjust syntax of tikz snippet

* Run cargo fmt

* Run cargo fmt

Co-authored-by: Francesco Gadaleta <francesco@amethix.com>
Co-authored-by: francesco <francesco.gadaleta@gmail.com>
Co-authored-by: moss <mossbanay@gmail.com>
4 people authored Dec 6, 2020
1 parent a3eede5 commit bfa5aeb
Showing 17 changed files with 941 additions and 413 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -20,11 +20,12 @@ exclude = [".github/"]

[dependencies]
num-traits = "0.2"
rand = "0.7"
ndarray = { version = "0.13", default-features = false }

[dev-dependencies]
rand = "0.7"
ndarray-rand = "0.12"
rand_isaac = "0.2"
approx = "0.3"

[workspace]
@@ -37,7 +38,6 @@ members = [
"linfa-trees",
"linfa-svm",
"linfa-hierarchical",
"linfa-ica",
]

[profile.release]
12 changes: 6 additions & 6 deletions linfa-logistic/src/lib.rs
@@ -437,7 +437,7 @@ impl<F: Float, C: PartialOrd + Clone> FittedLogisticRegression<F, C> {

/// Given a feature matrix, predict the classes learned when the model was
/// fitted.
pub fn predict_classes<A: Data<Elem = F>>(&self, x: &ArrayBase<A, Ix2>) -> Vec<C> {
pub fn predict<A: Data<Elem = F>>(&self, x: &ArrayBase<A, Ix2>) -> Vec<C> {
let pos_class = class_from_label(&self.labels, F::POSITIVE_LABEL);
let neg_class = class_from_label(&self.labels, F::NEGATIVE_LABEL);
self.predict_probabilities(x)
@@ -647,7 +647,7 @@ mod test {
let res = log_reg.fit(&x, &y).unwrap();
assert_eq!(res.intercept(), 0.0);
assert!(res.params().abs_diff_eq(&array![0.681], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -661,7 +661,7 @@
assert!(res
.predict_probabilities(&x)
.abs_diff_eq(&array![0.501, 0.664, 0.335, 0.498], 1e-3));
assert_eq!(res.predict_classes(&x), y);
assert_eq!(res.predict(&x), y);
}

#[test]
@@ -683,7 +683,7 @@
let res = log_reg.fit(&x, &y).unwrap();
assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3));
assert!(res.params().abs_diff_eq(&array![1.181], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -776,7 +776,7 @@ mod test {
let res = log_reg.fit(&x, &y).unwrap();
assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3));
assert!(res.params().abs_diff_eq(&array![1.181], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -787,6 +787,6 @@
let res = log_reg.fit(&x, &y).unwrap();
assert_eq!(res.intercept(), 0.0 as f32);
assert!(res.params().abs_diff_eq(&array![0.682 as f32], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}
}
1 change: 1 addition & 0 deletions linfa-svm/Cargo.toml
@@ -25,3 +25,4 @@ openblas-src = { version = "0.9", default-features = false, features = ["system"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
rand_isaac = "0.2"
5 changes: 4 additions & 1 deletion linfa-svm/src/classification.rs
@@ -314,8 +314,10 @@ mod tests {
use linfa_kernel::{Kernel, KernelMethod};

use ndarray::{Array, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;

pub fn generate_convoluted_rings(n_points: usize) -> Array2<f64> {
let mut out = Array::random((n_points * 2, 2), Uniform::new(0f64, 1.));
@@ -374,8 +376,9 @@

#[test]
fn test_polynomial_classification() {
let mut rng = Isaac64Rng::seed_from_u64(42);
// construct parabolica and classify middle area as positive and borders as negative
let records = Array::random((40, 1), Uniform::new(-2f64, 2.));
let records = Array::random_using((40, 1), Uniform::new(-2f64, 2.), &mut rng);
let targets = records.map_axis(Axis(1), |x| x[0] * x[0] < 0.5).to_vec();
let dataset = Dataset::new(records.clone(), targets);

5 changes: 2 additions & 3 deletions linfa-trees/Cargo.toml
@@ -16,13 +16,12 @@ categories = ["algorithms", "mathematics", "science"]
ndarray = { version = "0.13" , features = ["rayon", "approx"]}
ndarray-rand = "0.11"

linfa = { path = ".." }

[dev-dependencies]
rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }
criterion = "0.3"
serde_json = "1"
approx = "0.3"
#linfa-clustering = { version = "0.2.1", path = "../linfa-clustering" }

[[bench]]
name = "decision_tree"
42 changes: 24 additions & 18 deletions linfa-trees/benches/decision_tree.rs
@@ -1,48 +1,54 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use linfa_clustering::generate_blobs;
use linfa_trees::{DecisionTree, DecisionTreeParams};
use ndarray::{Array, Array2};
use linfa::prelude::*;
use linfa_trees::DecisionTree;
use ndarray::{stack, Array, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::rand_distr::{StandardNormal, Uniform};
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;
use std::iter::FromIterator;

fn generate_blobs(means: &Array2<f64>, samples: usize, mut rng: &mut Isaac64Rng) -> Array2<f64> {
let out = means
.axis_iter(Axis(0))
.map(|mean| Array::random_using((samples, 4), StandardNormal, &mut rng) + mean)
.collect::<Vec<_>>();
let out2 = out.iter().map(|x| x.view()).collect::<Vec<_>>();

stack(Axis(0), &out2).unwrap()
}

fn decision_tree_bench(c: &mut Criterion) {
let mut rng = Isaac64Rng::seed_from_u64(42);

// Controls how many samples for each class are generated
let training_set_sizes = vec![100, 1000, 10000, 100000];

let n_classes: u64 = 4;
let n_classes = 4;
let n_features = 4;

// Use the default configuration
let hyperparams = DecisionTreeParams::new(n_classes as u64);
let hyperparams = DecisionTree::params();

// Benchmark training time 10 times for each training sample size
let mut group = c.benchmark_group("decision_tree");
group.sample_size(10);

for n in training_set_sizes.iter() {
let centroids = Array2::random_using(
(n_classes as usize, n_features),
Uniform::new(-30., 30.),
&mut rng,
);
let centroids =
Array2::random_using((n_classes, n_features), Uniform::new(-30., 30.), &mut rng);

let train_x = generate_blobs(*n, &centroids, &mut rng);
let train_x = generate_blobs(&centroids, *n, &mut rng);
let train_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(*n).collect::<Vec<u64>>())
.map(|x| std::iter::repeat(x).take(*n).collect::<Vec<usize>>())
.flatten(),
);
let dataset = Dataset::new(train_x, train_y);

group.bench_with_input(
BenchmarkId::from_parameter(n),
&(train_x, train_y),
|b, (x, y)| b.iter(|| DecisionTree::fit(hyperparams.build(), &x, &y)),
);
group.bench_with_input(BenchmarkId::from_parameter(n), &dataset, |b, d| {
b.iter(|| hyperparams.fit(&d))
});
}

group.finish();
137 changes: 51 additions & 86 deletions linfa-trees/examples/decision_tree.rs
@@ -1,122 +1,87 @@
use linfa_trees::{DecisionTree, DecisionTreeParams, SplitQuality};
use ndarray::{array, s, Array, Array2, ArrayBase, Data, Ix1, Ix2};
use ndarray_rand::rand::Rng;
use std::fs::File;
use std::io::Write;

use ndarray::{array, stack, Array, Array1, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::StandardNormal;
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;
use std::iter::FromIterator;

/// Given an input matrix `blob_centroids`, with shape `(n_blobs, n_features)`,
/// generate `blob_size` data points (a "blob") around each of the blob centroids.
///
/// More specifically, each blob is formed by `blob_size` points sampled from a normal
/// distribution centered in the blob centroid with unit variance.
///
/// `generate_blobs` can be used to quickly assemble a synthetic dataset to test or
/// benchmark various clustering algorithms on a best-case scenario input.
pub fn generate_blobs(
blob_size: usize,
blob_centroids: &ArrayBase<impl Data<Elem = f64>, Ix2>,
rng: &mut impl Rng,
) -> Array2<f64> {
let (n_centroids, n_features) = blob_centroids.dim();
let mut blobs: Array2<f64> = Array2::zeros((n_centroids * blob_size, n_features));

for (blob_index, blob_centroid) in blob_centroids.genrows().into_iter().enumerate() {
let blob = generate_blob(blob_size, &blob_centroid, rng);

let indexes = s![blob_index * blob_size..(blob_index + 1) * blob_size, ..];
blobs.slice_mut(indexes).assign(&blob);
}
blobs
}

/// Generate `blob_size` data points (a "blob") around `blob_centroid`.
///
/// More specifically, the blob is formed by `blob_size` points sampled from a normal
/// distribution centered in `blob_centroid` with unit variance.
///
/// `generate_blob` can be used to quickly assemble a synthetic stereotypical cluster.
pub fn generate_blob(
blob_size: usize,
blob_centroid: &ArrayBase<impl Data<Elem = f64>, Ix1>,
rng: &mut impl Rng,
) -> Array2<f64> {
let shape = (blob_size, blob_centroid.len());
let origin_blob: Array2<f64> = Array::random_using(shape, StandardNormal, rng);
origin_blob + blob_centroid
}
use linfa::prelude::*;
use linfa_trees::{DecisionTree, SplitQuality};

fn accuracy(
labels: &ArrayBase<impl Data<Elem = u64>, Ix1>,
pred: &ArrayBase<impl Data<Elem = u64>, Ix1>,
) -> f64 {
let true_positive: f64 = labels
.iter()
.zip(pred.iter())
.filter(|(x, y)| x == y)
.map(|_| 1.0)
.sum();
true_positive / labels.len() as f64
fn generate_blobs(means: &[(f64, f64)], samples: usize, mut rng: &mut Isaac64Rng) -> Array2<f64> {
let out = means
.into_iter()
.map(|mean| {
Array::random_using((samples, 2), StandardNormal, &mut rng) + array![mean.0, mean.1]
})
.collect::<Vec<_>>();
let out2 = out.iter().map(|x| x.view()).collect::<Vec<_>>();

stack(Axis(0), &out2).unwrap()
}

fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// For each our expected centroids, generate `n` data points around it (a "blob")
let n_classes: u64 = 4;
let expected_centroids = array![[0., 0.], [1., 4.], [-5., 0.], [4., 4.]];
let n = 100;
let n_classes: usize = 4;
let n = 300;

println!("Generating training data");

let train_x = generate_blobs(n, &expected_centroids, &mut rng);
let train_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<u64>>())
.flatten(),
);
let train_x = generate_blobs(&[(0., 0.), (1., 4.), (-5., 0.), (4., 4.)], n, &mut rng);
let train_y = (0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<_>>())
.flatten()
.collect::<Array1<_>>();

let test_x = generate_blobs(n, &expected_centroids, &mut rng);
let test_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<u64>>())
.flatten(),
);

println!("Generated training data");
let dataset = Dataset::new(train_x, train_y).shuffle(&mut rng);
let (train, test) = dataset.split_with_ratio(0.9);

println!("Training model with Gini criterion ...");
let gini_hyperparams = DecisionTreeParams::new(n_classes)
let gini_model = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(100))
.min_samples_split(10)
.min_samples_leaf(10)
.build();
.min_weight_split(10.0)
.min_weight_leaf(10.0)
.fit(&train);

let gini_pred_y = gini_model.predict(test.records().view());
let cm = gini_pred_y.confusion_matrix(&test);

let gini_model = DecisionTree::fit(gini_hyperparams, &train_x, &train_y);
println!("{:?}", cm);

let gini_pred_y = gini_model.predict(&test_x);
println!(
"Test accuracy with Gini criterion: {:.2}%",
100.0 * accuracy(&test_y, &gini_pred_y)
100.0 * cm.accuracy()
);

println!("Training model with entropy criterion ...");
let entropy_hyperparams = DecisionTreeParams::new(n_classes)
let entropy_model = DecisionTree::params()
.split_quality(SplitQuality::Entropy)
.max_depth(Some(100))
.min_samples_split(10)
.min_samples_leaf(10)
.build();
.min_weight_split(10.0)
.min_weight_leaf(10.0)
.fit(&train);

let entropy_pred_y = gini_model.predict(test.records().view());
let cm = entropy_pred_y.confusion_matrix(&test);

let entropy_model = DecisionTree::fit(entropy_hyperparams, &train_x, &train_y);
println!("{:?}", cm);

let entropy_pred_y = entropy_model.predict(&test_x);
println!(
"Test accuracy with Entropy criterion: {:.2}%",
100.0 * accuracy(&test_y, &entropy_pred_y)
100.0 * cm.accuracy()
);

let feats = entropy_model.features();
println!("Features trained in this tree {:?}", feats);

let mut tikz = File::create("decision_tree_example.tex").unwrap();
tikz.write(gini_model.export_to_tikz().to_string().as_bytes())
.unwrap();
println!(" => generate tree description with `latex decision_tree_example.tex`!");
}