Linear decision trees improvements (#60)
* add debug trait to structs

* initial commit for random forest implementation

* WIP first iter of random forest

* add benches

* setup bench and cleanup

* cleanup

* add max_n_rows for single decision tree fitting

* implement random forest feature importance as collection of features from decision trees

* implement random forest feature importance as collection of features from decision trees

* remove unused var

* remove unused var

* run clippy

* assert test success for feature importance

* clippy and fmt

* store references of nodes to queue

* WIP voting classifier and predictor trait

* WIP voting classifier and predictor trait

* implement and test VotingClassifier hard voting

* implement predict_proba for random forest and tested

* documentation, examples, cleanup

* cleanup

* implement LinfaError for Predictor trait

* fixed tests and CI/CD pipeline

* renamed predict_classes to predict in logreg for consistency

* implement ProbabilisticPredictor whenever needed

* VotingClassifier implements Predictor trait

* PR-43 Moss comments addressed

* Switch `linfa-tree` to new infrastructure

* Experiment with interface

* Add argmax ensemble classifier

* Run fmt

* Add test with random noise

* Customize decision trees with weights

 * use weighting of dataset

* Remove unnecessary casting

* Compare weight in splits with hyperparams
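
A minimal, self-contained sketch of the idea (not the actual `linfa-trees` internals; `split_is_admissible` is a hypothetical helper, only the `min_weight_split`/`min_weight_leaf` hyperparameter names are taken from the diff below): a split is accepted only when the summed sample weights clear the configured thresholds.

```rust
// Hypothetical sketch, not the linfa-trees implementation: accept a split only
// when the summed sample weights clear the min_weight_split / min_weight_leaf
// thresholds (names taken from the builder calls in the diff below).
fn split_is_admissible(
    left_weights: &[f64],
    right_weights: &[f64],
    min_weight_split: f64,
    min_weight_leaf: f64,
) -> bool {
    let left: f64 = left_weights.iter().sum();
    let right: f64 = right_weights.iter().sum();
    // The node must carry enough total weight to be split at all, and each
    // child must keep at least `min_weight_leaf` of weight.
    left + right >= min_weight_split && left >= min_weight_leaf && right >= min_weight_leaf
}

fn main() {
    // Unweighted data corresponds to a weight of 1.0 per sample.
    let ok = split_is_admissible(&[1.0; 12], &[1.0; 3], 10.0, 5.0);
    println!("split admissible: {}", ok); // false: the right child is too light
}
```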

* Rename _samples hyperparams

* Fix cargo fmt lint?

* Shush random forest example for time being

* Added new test for perfectly separable data

* Appease clippy

* Fix error in test

 * use midpoint
 * skip equal values until new value is encountered
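
The fixed split-point selection works along these lines (illustrative sketch, not the actual crate code): sort the feature values, skip runs of equal values, and place each candidate threshold at the midpoint between distinct neighbouring values.

```rust
// Illustrative sketch, not the linfa-trees code: enumerate candidate split
// thresholds for one feature column.
fn candidate_thresholds(mut values: Vec<f64>) -> Vec<f64> {
    values.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mut thresholds = Vec::new();
    for window in values.windows(2) {
        let (prev, next) = (window[0], window[1]);
        // Skip equal values: a threshold between identical values cannot
        // separate any samples.
        if (next - prev).abs() > f64::EPSILON {
            // Use the midpoint between the two distinct neighbouring values.
            thresholds.push((prev + next) / 2.0);
        }
    }
    thresholds
}

fn main() {
    let feature = vec![1.0, 1.0, 1.0, 2.0, 4.0];
    // Distinct neighbours are (1.0, 2.0) and (2.0, 4.0) -> midpoints 1.5 and 3.0.
    assert_eq!(candidate_thresholds(feature), vec![1.5, 3.0]);
}
```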

* Run cargo fmt

* Add max_depth function for decision trees

* Add impurity decrease function

* Add mean impurity decrease
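
For reference, a self-contained sketch of what these quantities measure (illustrative code, not the crate's implementation; with a weighted dataset the counts below become summed sample weights): the impurity decrease of a split is the parent impurity minus the size-weighted impurities of the children, and the mean over all splits summarises the whole tree.

```rust
// Illustrative sketch: Gini impurity of a class-count vector and the weighted
// impurity decrease of one split,
// impurity(parent) - (n_left/n) * impurity(left) - (n_right/n) * impurity(right).
fn gini(counts: &[usize]) -> f64 {
    let total: usize = counts.iter().sum();
    if total == 0 {
        return 0.0;
    }
    let sum_sq: f64 = counts
        .iter()
        .map(|&c| {
            let p = c as f64 / total as f64;
            p * p
        })
        .sum();
    1.0 - sum_sq
}

fn impurity_decrease(parent: &[usize], left: &[usize], right: &[usize]) -> f64 {
    let n = parent.iter().sum::<usize>() as f64;
    let n_l = left.iter().sum::<usize>() as f64;
    let n_r = right.iter().sum::<usize>() as f64;
    gini(parent) - (n_l / n) * gini(left) - (n_r / n) * gini(right)
}

fn main() {
    // A perfect split of a balanced two-class node: impurity drops from 0.5 to 0.0.
    let decrease = impurity_decrease(&[10, 10], &[10, 0], &[0, 10]);
    assert!((decrease - 0.5).abs() < 1e-12);
}
```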

* Add more tests to linear decision trees

 * use toy test from sklearn
 * use four perfectly separable uniform blobs

* Remove number of classes hyper-parameter

This hyper-parameter can be estimated from the input data and is
therefore unnecessary in the API.
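
For example, the class count can be read straight off the training targets (hypothetical helper, shown only to illustrate the reasoning behind dropping the hyper-parameter):

```rust
use std::collections::HashSet;

// Hypothetical helper: the class count used to size internal arrays can be
// derived from the training targets, so callers never have to pass it.
fn n_classes(targets: &[usize]) -> usize {
    targets.iter().copied().collect::<HashSet<_>>().len()
}

fn main() {
    let targets = [0usize, 2, 1, 1, 0, 2, 2];
    assert_eq!(n_classes(&targets), 3);
}
```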

* Remove ensemble algorithm

* Address issue with toy test

* Simplify tree inspection methods

 * introduce node iterator
 * rewrite `max_depth`, `num_leaves`, `features` in iterator syntax
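
The pattern looks roughly like this (illustrative types, not the actual `linfa-trees` structs): once the tree exposes an iterator over its nodes, the inspection methods reduce to short iterator chains.

```rust
// Illustrative sketch of the pattern: an iterator that walks every node of the
// tree, so inspection methods become plain iterator chains.
struct Node {
    depth: usize,
    children: Vec<Node>,
}

struct NodeIter<'a> {
    queue: Vec<&'a Node>,
}

impl<'a> Iterator for NodeIter<'a> {
    type Item = &'a Node;

    fn next(&mut self) -> Option<&'a Node> {
        let node = self.queue.pop()?;
        // Push the children so they are visited on later calls.
        self.queue.extend(node.children.iter());
        Some(node)
    }
}

impl Node {
    fn iter(&self) -> NodeIter<'_> {
        NodeIter { queue: vec![self] }
    }

    fn max_depth(&self) -> usize {
        self.iter().map(|n| n.depth).max().unwrap_or(0)
    }

    fn num_leaves(&self) -> usize {
        self.iter().filter(|n| n.children.is_empty()).count()
    }
}

fn main() {
    let leaf = |depth| Node { depth, children: vec![] };
    let tree = Node {
        depth: 0,
        children: vec![
            Node { depth: 1, children: vec![leaf(2), leaf(2)] },
            leaf(1),
        ],
    };
    assert_eq!(tree.max_depth(), 2);
    assert_eq!(tree.num_leaves(), 3);
}
```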

* Add max depth testing and hyperparameter validation

* Fix parameter syntax in benchmarks

* Run cargo fmt

* Add tikz export builder

* Improve decision tree formatting

* Add pruning

* Adjust syntax of tikz snippet

* Run cargo fmt

* Run cargo fmt

Co-authored-by: Francesco Gadaleta <francesco@amethix.com>
Co-authored-by: francesco <francesco.gadaleta@gmail.com>
Co-authored-by: moss <mossbanay@gmail.com>
4 people authored Dec 6, 2020
1 parent a3eede5 commit bfa5aeb
Showing 17 changed files with 941 additions and 413 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -20,11 +20,12 @@ exclude = [".github/"]

[dependencies]
num-traits = "0.2"
rand = "0.7"
ndarray = { version = "0.13", default-features = false }

[dev-dependencies]
rand = "0.7"
ndarray-rand = "0.12"
rand_isaac = "0.2"
approx = "0.3"

[workspace]
@@ -37,7 +38,6 @@ members = [
"linfa-trees",
"linfa-svm",
"linfa-hierarchical",
"linfa-ica",
]

[profile.release]
12 changes: 6 additions & 6 deletions linfa-logistic/src/lib.rs
@@ -437,7 +437,7 @@ impl<F: Float, C: PartialOrd + Clone> FittedLogisticRegression<F, C> {

/// Given a feature matrix, predict the classes learned when the model was
/// fitted.
pub fn predict_classes<A: Data<Elem = F>>(&self, x: &ArrayBase<A, Ix2>) -> Vec<C> {
pub fn predict<A: Data<Elem = F>>(&self, x: &ArrayBase<A, Ix2>) -> Vec<C> {
let pos_class = class_from_label(&self.labels, F::POSITIVE_LABEL);
let neg_class = class_from_label(&self.labels, F::NEGATIVE_LABEL);
self.predict_probabilities(x)
@@ -647,7 +647,7 @@ mod test {
let res = log_reg.fit(&x, &y).unwrap();
assert_eq!(res.intercept(), 0.0);
assert!(res.params().abs_diff_eq(&array![0.681], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -661,7 +661,7 @@
assert!(res
.predict_probabilities(&x)
.abs_diff_eq(&array![0.501, 0.664, 0.335, 0.498], 1e-3));
assert_eq!(res.predict_classes(&x), y);
assert_eq!(res.predict(&x), y);
}

#[test]
@@ -683,7 +683,7 @@
let res = log_reg.fit(&x, &y).unwrap();
assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3));
assert!(res.params().abs_diff_eq(&array![1.181], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -776,7 +776,7 @@ mod test {
let res = log_reg.fit(&x, &y).unwrap();
assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3));
assert!(res.params().abs_diff_eq(&array![1.181], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}

#[test]
@@ -787,6 +787,6 @@
let res = log_reg.fit(&x, &y).unwrap();
assert_eq!(res.intercept(), 0.0 as f32);
assert!(res.params().abs_diff_eq(&array![0.682 as f32], 1e-3));
assert_eq!(res.predict_classes(&x), y.to_vec());
assert_eq!(res.predict(&x), y.to_vec());
}
}
1 change: 1 addition & 0 deletions linfa-svm/Cargo.toml
@@ -25,3 +25,4 @@ openblas-src = { version = "0.9", default-features = false, features = ["system"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
rand_isaac = "0.2"
5 changes: 4 additions & 1 deletion linfa-svm/src/classification.rs
@@ -314,8 +314,10 @@ mod tests {
use linfa_kernel::{Kernel, KernelMethod};

use ndarray::{Array, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;

pub fn generate_convoluted_rings(n_points: usize) -> Array2<f64> {
let mut out = Array::random((n_points * 2, 2), Uniform::new(0f64, 1.));
@@ -374,8 +376,9 @@

#[test]
fn test_polynomial_classification() {
let mut rng = Isaac64Rng::seed_from_u64(42);
// construct parabolica and classify middle area as positive and borders as negative
let records = Array::random((40, 1), Uniform::new(-2f64, 2.));
let records = Array::random_using((40, 1), Uniform::new(-2f64, 2.), &mut rng);
let targets = records.map_axis(Axis(1), |x| x[0] * x[0] < 0.5).to_vec();
let dataset = Dataset::new(records.clone(), targets);

5 changes: 2 additions & 3 deletions linfa-trees/Cargo.toml
@@ -16,13 +16,12 @@ categories = ["algorithms", "mathematics", "science"]
ndarray = { version = "0.13" , features = ["rayon", "approx"]}
ndarray-rand = "0.11"

linfa = { path = ".." }

[dev-dependencies]
rand_isaac = "0.2.0"
ndarray-npy = { version = "0.5", default-features = false }
criterion = "0.3"
serde_json = "1"
approx = "0.3"
#linfa-clustering = { version = "0.2.1", path = "../linfa-clustering" }

[[bench]]
name = "decision_tree"
42 changes: 24 additions & 18 deletions linfa-trees/benches/decision_tree.rs
@@ -1,48 +1,54 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use linfa_clustering::generate_blobs;
use linfa_trees::{DecisionTree, DecisionTreeParams};
use ndarray::{Array, Array2};
use linfa::prelude::*;
use linfa_trees::DecisionTree;
use ndarray::{stack, Array, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::rand_distr::{StandardNormal, Uniform};
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;
use std::iter::FromIterator;

fn generate_blobs(means: &Array2<f64>, samples: usize, mut rng: &mut Isaac64Rng) -> Array2<f64> {
let out = means
.axis_iter(Axis(0))
.map(|mean| Array::random_using((samples, 4), StandardNormal, &mut rng) + mean)
.collect::<Vec<_>>();
let out2 = out.iter().map(|x| x.view()).collect::<Vec<_>>();

stack(Axis(0), &out2).unwrap()
}

fn decision_tree_bench(c: &mut Criterion) {
let mut rng = Isaac64Rng::seed_from_u64(42);

// Controls how many samples for each class are generated
let training_set_sizes = vec![100, 1000, 10000, 100000];

let n_classes: u64 = 4;
let n_classes = 4;
let n_features = 4;

// Use the default configuration
let hyperparams = DecisionTreeParams::new(n_classes as u64);
let hyperparams = DecisionTree::params();

// Benchmark training time 10 times for each training sample size
let mut group = c.benchmark_group("decision_tree");
group.sample_size(10);

for n in training_set_sizes.iter() {
let centroids = Array2::random_using(
(n_classes as usize, n_features),
Uniform::new(-30., 30.),
&mut rng,
);
let centroids =
Array2::random_using((n_classes, n_features), Uniform::new(-30., 30.), &mut rng);

let train_x = generate_blobs(*n, &centroids, &mut rng);
let train_x = generate_blobs(&centroids, *n, &mut rng);
let train_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(*n).collect::<Vec<u64>>())
.map(|x| std::iter::repeat(x).take(*n).collect::<Vec<usize>>())
.flatten(),
);
let dataset = Dataset::new(train_x, train_y);

group.bench_with_input(
BenchmarkId::from_parameter(n),
&(train_x, train_y),
|b, (x, y)| b.iter(|| DecisionTree::fit(hyperparams.build(), &x, &y)),
);
group.bench_with_input(BenchmarkId::from_parameter(n), &dataset, |b, d| {
b.iter(|| hyperparams.fit(&d))
});
}

group.finish();
137 changes: 51 additions & 86 deletions linfa-trees/examples/decision_tree.rs
@@ -1,122 +1,87 @@
use linfa_trees::{DecisionTree, DecisionTreeParams, SplitQuality};
use ndarray::{array, s, Array, Array2, ArrayBase, Data, Ix1, Ix2};
use ndarray_rand::rand::Rng;
use std::fs::File;
use std::io::Write;

use ndarray::{array, stack, Array, Array1, Array2, Axis};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::StandardNormal;
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;
use std::iter::FromIterator;

/// Given an input matrix `blob_centroids`, with shape `(n_blobs, n_features)`,
/// generate `blob_size` data points (a "blob") around each of the blob centroids.
///
/// More specifically, each blob is formed by `blob_size` points sampled from a normal
/// distribution centered in the blob centroid with unit variance.
///
/// `generate_blobs` can be used to quickly assemble a synthetic dataset to test or
/// benchmark various clustering algorithms on a best-case scenario input.
pub fn generate_blobs(
blob_size: usize,
blob_centroids: &ArrayBase<impl Data<Elem = f64>, Ix2>,
rng: &mut impl Rng,
) -> Array2<f64> {
let (n_centroids, n_features) = blob_centroids.dim();
let mut blobs: Array2<f64> = Array2::zeros((n_centroids * blob_size, n_features));

for (blob_index, blob_centroid) in blob_centroids.genrows().into_iter().enumerate() {
let blob = generate_blob(blob_size, &blob_centroid, rng);

let indexes = s![blob_index * blob_size..(blob_index + 1) * blob_size, ..];
blobs.slice_mut(indexes).assign(&blob);
}
blobs
}

/// Generate `blob_size` data points (a "blob") around `blob_centroid`.
///
/// More specifically, the blob is formed by `blob_size` points sampled from a normal
/// distribution centered in `blob_centroid` with unit variance.
///
/// `generate_blob` can be used to quickly assemble a synthetic stereotypical cluster.
pub fn generate_blob(
blob_size: usize,
blob_centroid: &ArrayBase<impl Data<Elem = f64>, Ix1>,
rng: &mut impl Rng,
) -> Array2<f64> {
let shape = (blob_size, blob_centroid.len());
let origin_blob: Array2<f64> = Array::random_using(shape, StandardNormal, rng);
origin_blob + blob_centroid
}
use linfa::prelude::*;
use linfa_trees::{DecisionTree, SplitQuality};

fn accuracy(
labels: &ArrayBase<impl Data<Elem = u64>, Ix1>,
pred: &ArrayBase<impl Data<Elem = u64>, Ix1>,
) -> f64 {
let true_positive: f64 = labels
.iter()
.zip(pred.iter())
.filter(|(x, y)| x == y)
.map(|_| 1.0)
.sum();
true_positive / labels.len() as f64
fn generate_blobs(means: &[(f64, f64)], samples: usize, mut rng: &mut Isaac64Rng) -> Array2<f64> {
let out = means
.into_iter()
.map(|mean| {
Array::random_using((samples, 2), StandardNormal, &mut rng) + array![mean.0, mean.1]
})
.collect::<Vec<_>>();
let out2 = out.iter().map(|x| x.view()).collect::<Vec<_>>();

stack(Axis(0), &out2).unwrap()
}

fn main() {
// Our random number generator, seeded for reproducibility
let mut rng = Isaac64Rng::seed_from_u64(42);

// For each our expected centroids, generate `n` data points around it (a "blob")
let n_classes: u64 = 4;
let expected_centroids = array![[0., 0.], [1., 4.], [-5., 0.], [4., 4.]];
let n = 100;
let n_classes: usize = 4;
let n = 300;

println!("Generating training data");

let train_x = generate_blobs(n, &expected_centroids, &mut rng);
let train_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<u64>>())
.flatten(),
);
let train_x = generate_blobs(&[(0., 0.), (1., 4.), (-5., 0.), (4., 4.)], n, &mut rng);
let train_y = (0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<_>>())
.flatten()
.collect::<Array1<_>>();

let test_x = generate_blobs(n, &expected_centroids, &mut rng);
let test_y = Array::from_iter(
(0..n_classes)
.map(|x| std::iter::repeat(x).take(n).collect::<Vec<u64>>())
.flatten(),
);

println!("Generated training data");
let dataset = Dataset::new(train_x, train_y).shuffle(&mut rng);
let (train, test) = dataset.split_with_ratio(0.9);

println!("Training model with Gini criterion ...");
let gini_hyperparams = DecisionTreeParams::new(n_classes)
let gini_model = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(100))
.min_samples_split(10)
.min_samples_leaf(10)
.build();
.min_weight_split(10.0)
.min_weight_leaf(10.0)
.fit(&train);

let gini_pred_y = gini_model.predict(test.records().view());
let cm = gini_pred_y.confusion_matrix(&test);

let gini_model = DecisionTree::fit(gini_hyperparams, &train_x, &train_y);
println!("{:?}", cm);

let gini_pred_y = gini_model.predict(&test_x);
println!(
"Test accuracy with Gini criterion: {:.2}%",
100.0 * accuracy(&test_y, &gini_pred_y)
100.0 * cm.accuracy()
);

println!("Training model with entropy criterion ...");
let entropy_hyperparams = DecisionTreeParams::new(n_classes)
let entropy_model = DecisionTree::params()
.split_quality(SplitQuality::Entropy)
.max_depth(Some(100))
.min_samples_split(10)
.min_samples_leaf(10)
.build();
.min_weight_split(10.0)
.min_weight_leaf(10.0)
.fit(&train);

let entropy_pred_y = gini_model.predict(test.records().view());
let cm = entropy_pred_y.confusion_matrix(&test);

let entropy_model = DecisionTree::fit(entropy_hyperparams, &train_x, &train_y);
println!("{:?}", cm);

let entropy_pred_y = entropy_model.predict(&test_x);
println!(
"Test accuracy with Entropy criterion: {:.2}%",
100.0 * accuracy(&test_y, &entropy_pred_y)
100.0 * cm.accuracy()
);

let feats = entropy_model.features();
println!("Features trained in this tree {:?}", feats);

let mut tikz = File::create("decision_tree_example.tex").unwrap();
tikz.write(gini_model.export_to_tikz().to_string().as_bytes())
.unwrap();
println!(" => generate tree description with `latex decision_tree_example.tex`!");
}