Commit: set golden features number (mljar#378)

pplonski committed Apr 26, 2021
1 parent f618d7d commit 38917dc
Showing 10 changed files with 200 additions and 27 deletions.
9 changes: 7 additions & 2 deletions supervised/automl.py
@@ -148,13 +148,18 @@ def __init__(
  If left `auto` AutoML will produce explanations based on the selected `mode`.
- golden_features (boolean): Whether to use golden features
+ golden_features (boolean or int): Whether to use golden features (and how many should be added)
  If left `auto` AutoML will use golden features based on the selected `mode`:
  - If `mode` is "Explain", `golden_features` = False.
  - If `mode` is "Perform", `golden_features` = True.
  - If `mode` is "Compete", `golden_features` = True.
+ If `boolean` value is set then the number of Golden Features is set automatically.
+ It is set to min(100, max(10, 0.1*number_of_input_features)).
+ If `int` value is set, the number of Golden Features is set to this value.
features_selection (boolean): Whether to do features_selection
If left `auto` AutoML will do feature selection based on the selected `mode`:
@@ -325,7 +330,7 @@ def fit(self, X, y, sample_weight=None, cv=None):
cv (iterable or list): List or iterable with (train, validation) splits representing array of indices.
It is used only with custom validation (`validation_strategy={'validation_type': 'custom'}`).
Returns:
AutoML object: Returns `self`
"""
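Usage sketch (not part of the diff; based only on the docstring above): the `golden_features` argument now accepts "auto", a boolean, or an int.

    from supervised import AutoML

    # "auto" (default): resolved from the selected `mode`
    # (False for Explain, True for Perform and Compete).
    automl_auto = AutoML()

    # boolean: enable/disable golden features; when enabled, the count is
    # min(100, max(10, 0.1 * number_of_input_features)).
    automl_bool = AutoML(golden_features=True)

    # int: enable golden features and request exactly this many.
    automl_int = AutoML(golden_features=50)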
9 changes: 5 additions & 4 deletions supervised/base_automl.py
@@ -555,11 +555,10 @@ def _save_data(self, X, y, sample_weight=None, cv=None):
self._validation_strategy["sample_weight_path"] = self._sample_weight_path

if cv is not None:
self._validation_strategy["cv_path"] = os.path.join(self._results_path, "cv.data")
joblib.dump(
cv,
self._validation_strategy["cv_path"]
self._validation_strategy["cv_path"] = os.path.join(
self._results_path, "cv.data"
)
joblib.dump(cv, self._validation_strategy["cv_path"])

if self._max_single_prediction_time is not None:
self._one_sample = X.iloc[:1].copy(deep=True)
@@ -1866,6 +1865,8 @@ def _validate_golden_features(self):
""" Validates golden_features parameter"""
if isinstance(self.golden_features, str) and self.golden_features == "auto":
return
+ if isinstance(self.golden_features, int):
+     return
check_bool(self.golden_features, "golden_features")

def _validate_features_selection(self):
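Side note on the int check above: in Python, `bool` is a subclass of `int`, so `isinstance(self.golden_features, int)` also returns early for `True`/`False`; `check_bool` is only reached for values that are neither "auto" nor int-like. A standalone sketch (hypothetical helper, mirroring the validator's order of checks):

    def accepts_golden_features(value) -> bool:
        if isinstance(value, str) and value == "auto":
            return True
        if isinstance(value, int):  # isinstance(True, int) is True in Python
            return True
        return False  # check_bool would raise here

    assert accepts_golden_features("auto")
    assert accepts_golden_features(True)   # bool passes the int check
    assert accepts_golden_features(50)
    assert not accepts_golden_features(1.5)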
11 changes: 10 additions & 1 deletion supervised/preprocessing/goldenfeatures_transformer.py
@@ -104,10 +104,11 @@ def get_score(item):


class GoldenFeaturesTransformer(object):
- def __init__(self, results_path=None, ml_task=None):
+ def __init__(self, results_path=None, ml_task=None, features_count=None):
self._new_features = []
self._new_columns = []
self._ml_task = ml_task
+ self._features_count = features_count
self._scorer = None
if self._ml_task == BINARY_CLASSIFICATION:
self._scorer = get_binary_score
@@ -184,6 +185,14 @@ def fit(self, X, y):

new_cols_cnt = np.min([100, np.max([10, int(0.1 * X.shape[1])])])

+ if (
+     self._features_count is not None
+     and self._features_count > 0
+     and self._features_count < df.shape[0]
+ ):
+     new_cols_cnt = self._features_count
+
+ print(self._features_count, new_cols_cnt)
self._new_features = json.loads(df.head(new_cols_cnt).to_json(orient="records"))

for new_feature in self._new_features:
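Worked example of the count selection in `fit` above (a sketch, not the committed code; `df` holds the scored candidate features at this point, so an explicit `features_count` only takes effect when it is positive and smaller than the number of candidates):

    import numpy as np

    def resolve_new_cols_cnt(n_input_features, n_candidates, features_count=None):
        # Default: min(100, max(10, 10% of the original feature count)).
        cnt = np.min([100, np.max([10, int(0.1 * n_input_features)])])
        if features_count is not None and 0 < features_count < n_candidates:
            cnt = features_count
        return cnt

    print(resolve_new_cols_cnt(10, 45))      # 10: the automatic floor
    print(resolve_new_cols_cnt(10, 45, 42))  # 42: explicit request wins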
5 changes: 4 additions & 1 deletion supervised/preprocessing/preprocessing.py
@@ -185,7 +185,10 @@ def fit_and_transform(self, X_train, y_train, sample_weight=None):
if "golden_features" in self._params:
results_path = self._params["golden_features"]["results_path"]
ml_task = self._params["golden_features"]["ml_task"]
- self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
+ features_count = self._params["golden_features"].get("features_count")
+ self._golden_features = GoldenFeaturesTransformer(
+     results_path, ml_task, features_count
+ )
self._golden_features.fit(X_train[numeric_cols], y_train)
X_train = self._golden_features.transform(X_train)
golden_columns = self._golden_features._new_columns
1 change: 0 additions & 1 deletion supervised/preprocessing/preprocessing_utils.py
@@ -36,7 +36,6 @@ def get_type(x):
# treat it as categorical
return PreprocessingUtils.CATEGORICAL


if data_type == PreprocessingUtils.CATEGORICAL:
# check maybe this categorical is a text
# it is a text, if:
9 changes: 9 additions & 0 deletions supervised/tuner/mljar_tuner.py
@@ -742,6 +742,15 @@ def get_golden_features_params(
"results_path": results_path,
"ml_task": self._ml_task,
}
+ if (
+     self._golden_features is not None
+     and not isinstance(self._golden_features, bool)
+     and isinstance(self._golden_features, int)
+ ):
+     params["preprocessing"]["golden_features"][
+         "features_count"
+     ] = self._golden_features

params["name"] += "_GoldenFeatures"
params["status"] = "initialized"
params["final_loss"] = None
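For orientation, the params handed to a golden-features step then look roughly like this when a user passes `golden_features=50` (the `name` and `results_path` values below are hypothetical; only `features_count` is new). The `not isinstance(..., bool)` guard matters because `bool` subclasses `int`: it keeps `golden_features=True` on the automatic count.

    params = {
        "name": "1_Xgboost",  # hypothetical
        "preprocessing": {
            "golden_features": {
                "results_path": "AutoML_1",  # hypothetical
                "ml_task": "binary_classification",
                "features_count": 50,  # set only for a non-bool int
            }
        },
    }
    params["name"] += "_GoldenFeatures"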
2 changes: 1 addition & 1 deletion supervised/validation/validation_step.py
@@ -25,7 +25,7 @@ def __init__(self, params):
raise AutoMLException(
f"The validation type ({self.validation_type}) is not implemented."
)

def get_split(self, k, repeat=0):
return self.validator.get_split(k, repeat)

24 changes: 11 additions & 13 deletions supervised/validation/validator_custom.py
@@ -23,11 +23,9 @@ def __init__(self, params):
BaseValidator.__init__(self, params)

cv_path = self.params.get("cv_path")

if cv_path is None:
- raise AutoMLException(
-     "You need to specify `cv` as list or iterable"
- )
+ raise AutoMLException("You need to specify `cv` as list or iterable")

self.cv = joblib.load(cv_path)
self.cv = list(self.cv)
@@ -48,14 +46,10 @@ def __init__(self, params):

print("Custom validation strategy")
for fold_cnt, (train_index, validation_index) in enumerate(self.cv):

print(f"Split {fold_cnt}.")
- print(
-     f"Train {train_index.shape[0]} samples."
- )
- print(
-     f"Validation {validation_index.shape[0]} samples."
- )
+ print(f"Train {train_index.shape[0]} samples.")
+ print(f"Validation {validation_index.shape[0]} samples.")
train_index_file = os.path.join(
self._results_path,
"folds",
@@ -88,19 +82,23 @@ def get_split(self, k, repeat=0):
X = load_data(self._X_path)
y = load_data(self._y_path)
y = y["target"]

sample_weight = None
if self._sample_weight_path is not None:
sample_weight = load_data(self._sample_weight_path)
sample_weight = sample_weight["sample_weight"]

train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]}
validation_data = {"X": X.iloc[validation_index], "y": y.iloc[validation_index]}
validation_data = {
"X": X.iloc[validation_index],
"y": y.iloc[validation_index],
}
if sample_weight is not None:
train_data["sample_weight"] = sample_weight.iloc[train_index]
validation_data["sample_weight"] = sample_weight.iloc[validation_index]
except Exception as e:
import traceback

print(traceback.format_exc())
raise AutoMLException("Problem with custom validation. " + str(e))
return (train_data, validation_data)
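Putting the custom validator together with the `fit(..., cv=...)` parameter documented in `supervised/automl.py` above, end-to-end usage looks roughly like this (a sketch; `KFold` is just one convenient way to build the (train, validation) index splits):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold
    from supervised import AutoML

    X = pd.DataFrame(np.random.rand(100, 5), columns=[f"f{i}" for i in range(5)])
    y = np.random.randint(0, 2, 100)

    # Each cv entry is a (train_index, validation_index) pair of index arrays.
    cv = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X))

    automl = AutoML(validation_strategy={"validation_type": "custom"})
    automl.fit(X, y, cv=cv)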
120 changes: 120 additions & 0 deletions tests/tests_automl/test_golden_features.py
@@ -0,0 +1,120 @@
import os
import unittest
import tempfile
import json
import numpy as np
import pandas as pd
import shutil
from supervised import AutoML
from numpy.testing import assert_almost_equal
from sklearn import datasets
from supervised.exceptions import AutoMLException

class AutoMLGoldenFeaturesTest(unittest.TestCase):

automl_dir = "automl_tests"
rows = 50

def tearDown(self):
shutil.rmtree(self.automl_dir, ignore_errors=True)

def test_no_golden_features(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=1,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=False,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 1)

def test_golden_features(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=10,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=True,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 2)

# there should be 10 golden features
with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
d = json.loads(fin.read())
self.assertEqual(len(d["new_features"]), 10)


def test_golden_features_count(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=10,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=50,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 2)

# there should be 50 golden features
with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
d = json.loads(fin.read())
self.assertEqual(len(d["new_features"]), 50)
37 changes: 33 additions & 4 deletions tests/tests_preprocessing/test_goldenfeatures_transformer.py
@@ -38,16 +38,12 @@ def test_transformer(self):
)

df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
- print(df)

with tempfile.TemporaryDirectory() as tmpdir:
gft = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft.fit(df, y)

df = gft.transform(df)
- print(df)

- print(gft.to_json())

gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft3.from_json(gft.to_json(), tmpdir)
@@ -138,3 +134,36 @@ def test_subsample_binclass_4k(self):
for uni in [np.unique(y_train), np.unique(y_test)]:
for i in range(2):
self.assertTrue(i in uni)


def test_features_count(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])


with tempfile.TemporaryDirectory() as tmpdir:
FEATURES_COUNT = 42
gft = GoldenFeaturesTransformer(tmpdir, "binary_classification", features_count = FEATURES_COUNT)
gft.fit(df, y)

self.assertEqual(len(gft._new_features), FEATURES_COUNT)

gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft3.from_json(gft.to_json(), tmpdir)

df = gft3.transform(df)
self.assertEqual(df.shape[1], N_COLS + FEATURES_COUNT)
