Commit
Merge branch 'master' into 0.11.0dev0
rodrigo-arenas authored Sep 12, 2024
2 parents fba486c + 1e9740b commit a19c6d6
Showing 5 changed files with 153 additions and 11 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -129,4 +129,7 @@ dmypy.json
 .pyre/
 
 #Pycharm
-.idea
+.idea
+
+#VSCode
+.vscode
25 changes: 25 additions & 0 deletions README.rst
@@ -10,6 +10,7 @@

.. |PythonVersion| image:: https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue
.. _PythonVersion : https://www.python.org/downloads/

.. |PyPi| image:: https://badge.fury.io/py/sklearn-genetic-opt.svg
.. _PyPi: https://badge.fury.io/py/sklearn-genetic-opt

@@ -30,6 +31,29 @@ scikit-learn models hyperparameters tuning and feature selection, using evolutio
 This is meant to be an alternative to popular methods inside scikit-learn, such as Grid Search and Randomized Grid Search
 for hyperparameter tuning, and to RFE (Recursive Feature Elimination) and Select From Model for feature selection.
 
+**Table of Contents**
+######################
+
+- Sklearn-genetic-opt Overview
+- Main Features
+- Demos on Features
+- Installation
+- Basic Installation
+- Full Installation with Extras
+- Usage
+- Hyperparameters Tuning
+- Feature Selection
+- Documentation
+- Stable
+- Latest
+- Development
+- Changelog
+- Important Links
+- Source Code
+- Contributing
+- Testing
+
+
 Sklearn-genetic-opt uses evolutionary algorithms from the `DEAP <https://deap.readthedocs.io/en/master/>`_ (Distributed Evolutionary Algorithms in Python) package to choose the set of hyperparameters that
 optimizes (max or min) the cross-validation scores; it can be used for both regression and classification problems.
 
@@ -87,6 +111,7 @@ Example: Hyperparameters Tuning

.. code-block:: python

    from sklearn_genetic import GASearchCV
    from sklearn_genetic.space import Continuous, Categorical, Integer
    from sklearn.ensemble import RandomForestClassifier
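The README example above is truncated in this diff view; as a rough sketch of the usage pattern it documents (the dataset, search ranges, and settings below are illustrative assumptions, not the README's exact values):

from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Each hyperparameter gets a typed dimension the evolutionary algorithm
# samples from; these ranges are made up for illustration.
param_grid = {
    "min_weight_fraction_leaf": Continuous(0.01, 0.5, distribution="log-uniform"),
    "bootstrap": Categorical([True, False]),
    "max_depth": Integer(2, 30),
    "n_estimators": Integer(100, 300),
}

evolved_estimator = GASearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
    population_size=10,
    generations=20,
    n_jobs=-1,
)
evolved_estimator.fit(X_train, y_train)
print(evolved_estimator.best_params_)  # winning configuration after the evolution

After fit, the tuned model is also reachable through the usual scikit-learn search conventions the class follows, such as best_estimator_ and predict.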
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -8,7 +8,7 @@ twine==3.3.0
 seaborn>=0.11.2
 mlflow>=2.1.0
 tcl==0.2
-black==23.1.0
+black==24.3.0
 sphinx
 sphinx_gallery
 sphinx_rtd_theme
27 changes: 18 additions & 9 deletions sklearn_genetic/genetic_search.py
@@ -263,7 +263,7 @@ def __init__(
 self.pre_dispatch = pre_dispatch
 self.error_score = error_score
 self.return_train_score = return_train_score
-self.creator = creator
+# self.creator = creator
 self.log_config = log_config
 self.use_cache = use_cache
 self.fitness_cache = {}
@@ -311,6 +311,7 @@ def _register(self):
self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, 1.0])
self.creator.create("Individual", list, fitness=creator.FitnessMax)


attributes = []
# Assign all the parameters defined in the param_grid
# It uses the distribution parameter to set the sampling function
@@ -357,6 +358,7 @@ def _register(self):
self._stats.register("fitness_max", np.max, axis=0)
self._stats.register("fitness_min", np.min, axis=0)


self.logbook = tools.Logbook()

def _initialize_population(self):
@@ -594,8 +596,8 @@ def fit(self, X, y, callbacks=None):
 for k in range(len(self._hof))
 }
 
-del self.creator.FitnessMax
-del self.creator.Individual
+del creator.FitnessMax
+del creator.Individual
 
 return self

@@ -937,7 +939,7 @@ def __init__(
 self.pre_dispatch = pre_dispatch
 self.error_score = error_score
 self.return_train_score = return_train_score
-self.creator = creator
+# self.creator = creator
 self.log_config = log_config
 self.use_cache = use_cache
 self.fitness_cache = {}
@@ -963,8 +965,8 @@ def _register(self):

 # Criteria sign to set max or min problem
 # And -1.0 as second weight to minimize number of features
-self.creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0])
-self.creator.create("Individual", list, fitness=creator.FitnessMax)
+creator.create("FitnessMax", base.Fitness, weights=[self.criteria_sign, -1.0])
+creator.create("Individual", list, fitness=creator.FitnessMax)
 
 # Register the array to choose the features
 # Each binary value represents if the feature is selected or not
@@ -994,7 +996,7 @@ def _register(self):

 # Stats among axis 0 to get two values:
 # One based on the score and the other in the number of features
-self._stats = tools.Statistics(lambda ind: ind.fitness.values)
+self._stats = tools.Statistics(ind_fitness_values)
 self._stats.register("fitness", np.mean, axis=0)
 self._stats.register("fitness_std", np.std, axis=0)
 self._stats.register("fitness_max", np.max, axis=0)
@@ -1185,8 +1187,8 @@ def fit(self, X, y, callbacks=None):

 self.hof = self._hof
 
-del self.creator.FitnessMax
-del self.creator.Individual
+del creator.FitnessMax
+del creator.Individual
 
 return self

@@ -1446,3 +1448,10 @@ def score(self, X, y):
 ``best_estimator_.score`` method otherwise.
 """
 return self.estimator.score(self.transform(X), y)
+
+
+# helpers
+
+
+def ind_fitness_values(ind):
+    return ind.fitness.values
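Read together, the genetic_search.py edits look like a serialization fix: storing the DEAP creator module on the instance (self.creator = creator) and handing tools.Statistics a lambda both make a fitted estimator unpicklable, since pickle refuses module objects and cannot resolve a lambda by name. The module-level ind_fitness_values helper and the direct creator references avoid both problems, and the new test file below exercises the round trip. A minimal, DEAP-independent sketch of that pickle behavior (the names here are illustrative):

import pickle

# Functions pickle by reference (module plus qualified name). A lambda's
# qualified name is "<lambda>", which cannot be looked up on load,
# so pickling it fails with PicklingError.
stat_key = lambda ind: ind.fitness.values
try:
    pickle.dumps(stat_key)
except pickle.PicklingError as exc:
    print(f"lambda is not picklable: {exc}")

# Module objects cannot be pickled at all, which is why keeping
# `self.creator = creator` as instance state broke dump/load.
try:
    pickle.dumps(pickle)  # any module object fails the same way
except TypeError as exc:
    print(f"module is not picklable: {exc}")


# A plain module-level function round-trips fine: only its location is
# stored, and the loader re-imports it by name.
def ind_fitness_values(ind):
    return ind.fitness.values


payload = pickle.dumps(ind_fitness_values)
assert pickle.loads(payload) is ind_fitness_values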
105 changes: 105 additions & 0 deletions sklearn_genetic/tests/test_serialization.py
@@ -0,0 +1,105 @@
import pytest
from sklearn.datasets import load_iris, load_diabetes
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import make_scorer
import numpy as np

from .. import GAFeatureSelectionCV
from ..callbacks import (
    ThresholdStopping,
    DeltaThreshold,
    ConsecutiveStopping,
    TimerStopping,
    ProgressBar,
)
from ..schedules import ExponentialAdapter, InverseAdapter
from joblib import dump, load
import os


data = load_iris()
label_names = data["target_names"]
y = data["target"]
X = data["data"]

noise = np.random.uniform(1, 4, size=(X.shape[0], 10))

X = np.hstack((X, noise))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


def test_estimator_serialization():
    clf = SGDClassifier(loss="modified_huber", fit_intercept=True)
    generations = 6
    evolved_estimator = GAFeatureSelectionCV(
        clf,
        cv=3,
        scoring="accuracy",
        population_size=6,
        generations=generations,
        tournament_size=3,
        elitism=False,
        keep_top_k=4,
        verbose=False,
        algorithm="eaSimple",
        n_jobs=-1,
        return_train_score=True,
    )

    evolved_estimator.fit(X_train, y_train)
    dump_file = "evolved_estimator.pkl"

    # test dump
    assert dump(evolved_estimator, dump_file)[0] == dump_file

    # load
    dumped_estimator = load(dump_file)
    features = dumped_estimator.support_

    assert check_is_fitted(dumped_estimator) is None
    assert features.shape[0] == X.shape[1]
    assert len(dumped_estimator) == generations + 1  # +1 random initial population
    assert len(dumped_estimator.predict(X_test)) == len(X_test)
    assert dumped_estimator.score(X_train, y_train) >= 0
    assert len(dumped_estimator.decision_function(X_test)) == len(X_test)
    assert len(dumped_estimator.predict_proba(X_test)) == len(X_test)
    assert len(dumped_estimator.predict_log_proba(X_test)) == len(X_test)
    assert dumped_estimator.score(X_test, y_test) == accuracy_score(
        y_test, dumped_estimator.predict(X_test)
    )
    assert bool(dumped_estimator.get_params())
    assert len(dumped_estimator.hof) == dumped_estimator.keep_top_k
    assert "gen" in dumped_estimator[0]
    assert "fitness_max" in dumped_estimator[0]
    assert "fitness" in dumped_estimator[0]
    assert "fitness_std" in dumped_estimator[0]
    assert "fitness_min" in dumped_estimator[0]

    cv_results_ = dumped_estimator.cv_results_
    cv_result_keys = set(cv_results_.keys())

    assert "split0_test_score" in cv_result_keys
    assert "split1_test_score" in cv_result_keys
    assert "split2_test_score" in cv_result_keys
    assert "split0_train_score" in cv_result_keys
    assert "split1_train_score" in cv_result_keys
    assert "split2_train_score" in cv_result_keys
    assert "mean_test_score" in cv_result_keys
    assert "std_test_score" in cv_result_keys
    assert "rank_test_score" in cv_result_keys
    assert "mean_train_score" in cv_result_keys
    assert "std_train_score" in cv_result_keys
    assert "rank_train_score" in cv_result_keys
    assert "std_fit_time" in cv_result_keys
    assert "mean_score_time" in cv_result_keys
    assert "rank_n_features" in cv_result_keys
    assert "features" in cv_result_keys

    # delete dumped estimator
    os.remove(dump_file)
