Commit: set golden features number (mljar#378)

pplonski committed Apr 26, 2021
1 parent f618d7d commit 38917dc
Showing 10 changed files with 200 additions and 27 deletions.
9 changes: 7 additions & 2 deletions supervised/automl.py
@@ -148,13 +148,18 @@ def __init__(
  If left `auto` AutoML will produce explanations based on the selected `mode`.
- golden_features (boolean): Whether to use golden features
+ golden_features (boolean or int): Whether to use golden features (and how many should be added)
  If left `auto` AutoML will use golden features based on the selected `mode`:
  - If `mode` is "Explain", `golden_features` = False.
  - If `mode` is "Perform", `golden_features` = True.
  - If `mode` is "Compete", `golden_features` = True.
+ If `boolean` value is set then the number of Golden Features is set automatically.
+ It is set to min(100, max(10, 0.1*number_of_input_features)).
+ If `int` value is set, the number of Golden Features is set to this value.
features_selection (boolean): Whether to do features_selection
If left `auto` AutoML will do feature selection based on the selected `mode`:
@@ -325,7 +330,7 @@ def fit(self, X, y, sample_weight=None, cv=None):
cv (iterable or list): List or iterable with (train, validation) splits representing array of indices.
It is used only with custom validation (`validation_strategy={'validation_type': 'custom'}`).
Returns:
AutoML object: Returns `self`
"""
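Usage sketch (not part of the diff; based only on the docstring above): the `golden_features` argument now accepts "auto", a boolean, or an int.

    from supervised import AutoML

    # "auto" (default): resolved from the selected `mode`
    # (False for Explain, True for Perform and Compete).
    automl_auto = AutoML()

    # boolean: enable/disable golden features; when enabled, the count is
    # min(100, max(10, 0.1 * number_of_input_features)).
    automl_bool = AutoML(golden_features=True)

    # int: enable golden features and request exactly this many.
    automl_int = AutoML(golden_features=50)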
9 changes: 5 additions & 4 deletions supervised/base_automl.py
@@ -555,11 +555,10 @@ def _save_data(self, X, y, sample_weight=None, cv=None):
self._validation_strategy["sample_weight_path"] = self._sample_weight_path

if cv is not None:
self._validation_strategy["cv_path"] = os.path.join(self._results_path, "cv.data")
joblib.dump(
cv,
self._validation_strategy["cv_path"]
self._validation_strategy["cv_path"] = os.path.join(
self._results_path, "cv.data"
)
joblib.dump(cv, self._validation_strategy["cv_path"])

if self._max_single_prediction_time is not None:
self._one_sample = X.iloc[:1].copy(deep=True)
@@ -1866,6 +1865,8 @@ def _validate_golden_features(self):
""" Validates golden_features parameter"""
if isinstance(self.golden_features, str) and self.golden_features == "auto":
return
+ if isinstance(self.golden_features, int):
+     return
check_bool(self.golden_features, "golden_features")

def _validate_features_selection(self):
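Side note on the int check above: in Python, `bool` is a subclass of `int`, so `isinstance(self.golden_features, int)` also returns early for `True`/`False`; `check_bool` is only reached for values that are neither "auto" nor int-like. A standalone sketch (hypothetical helper, mirroring the validator's order of checks):

    def accepts_golden_features(value) -> bool:
        if isinstance(value, str) and value == "auto":
            return True
        if isinstance(value, int):  # isinstance(True, int) is True in Python
            return True
        return False  # check_bool would raise here

    assert accepts_golden_features("auto")
    assert accepts_golden_features(True)   # bool passes the int check
    assert accepts_golden_features(50)
    assert not accepts_golden_features(1.5)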
11 changes: 10 additions & 1 deletion supervised/preprocessing/goldenfeatures_transformer.py
@@ -104,10 +104,11 @@ def get_score(item):


class GoldenFeaturesTransformer(object):
- def __init__(self, results_path=None, ml_task=None):
+ def __init__(self, results_path=None, ml_task=None, features_count=None):
self._new_features = []
self._new_columns = []
self._ml_task = ml_task
+ self._features_count = features_count
self._scorer = None
if self._ml_task == BINARY_CLASSIFICATION:
self._scorer = get_binary_score
@@ -184,6 +185,14 @@ def fit(self, X, y):

new_cols_cnt = np.min([100, np.max([10, int(0.1 * X.shape[1])])])

+ if (
+     self._features_count is not None
+     and self._features_count > 0
+     and self._features_count < df.shape[0]
+ ):
+     new_cols_cnt = self._features_count
+
+ print(self._features_count, new_cols_cnt)
self._new_features = json.loads(df.head(new_cols_cnt).to_json(orient="records"))

for new_feature in self._new_features:
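Worked example of the count selection in `fit` above (a sketch, not the committed code; `df` holds the scored candidate features at this point, so an explicit `features_count` only takes effect when it is positive and smaller than the number of candidates):

    import numpy as np

    def resolve_new_cols_cnt(n_input_features, n_candidates, features_count=None):
        # Default: min(100, max(10, 10% of the original feature count)).
        cnt = np.min([100, np.max([10, int(0.1 * n_input_features)])])
        if features_count is not None and 0 < features_count < n_candidates:
            cnt = features_count
        return cnt

    print(resolve_new_cols_cnt(10, 45))      # 10: the automatic floor
    print(resolve_new_cols_cnt(10, 45, 42))  # 42: explicit request wins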
5 changes: 4 additions & 1 deletion supervised/preprocessing/preprocessing.py
@@ -185,7 +185,10 @@ def fit_and_transform(self, X_train, y_train, sample_weight=None):
if "golden_features" in self._params:
results_path = self._params["golden_features"]["results_path"]
ml_task = self._params["golden_features"]["ml_task"]
- self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
+ features_count = self._params["golden_features"].get("features_count")
+ self._golden_features = GoldenFeaturesTransformer(
+     results_path, ml_task, features_count
+ )
self._golden_features.fit(X_train[numeric_cols], y_train)
X_train = self._golden_features.transform(X_train)
golden_columns = self._golden_features._new_columns
1 change: 0 additions & 1 deletion supervised/preprocessing/preprocessing_utils.py
@@ -36,7 +36,6 @@ def get_type(x):
# treat it as categorical
return PreprocessingUtils.CATEGORICAL


if data_type == PreprocessingUtils.CATEGORICAL:
# check maybe this categorical is a text
# it is a text, if:
9 changes: 9 additions & 0 deletions supervised/tuner/mljar_tuner.py
@@ -742,6 +742,15 @@ def get_golden_features_params(
"results_path": results_path,
"ml_task": self._ml_task,
}
+ if (
+     self._golden_features is not None
+     and not isinstance(self._golden_features, bool)
+     and isinstance(self._golden_features, int)
+ ):
+     params["preprocessing"]["golden_features"][
+         "features_count"
+     ] = self._golden_features

params["name"] += "_GoldenFeatures"
params["status"] = "initialized"
params["final_loss"] = None
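For orientation, the params handed to a golden-features step then look roughly like this when a user passes `golden_features=50` (the `name` and `results_path` values below are hypothetical; only `features_count` is new). The `not isinstance(..., bool)` guard matters because `bool` subclasses `int`: it keeps `golden_features=True` on the automatic count.

    params = {
        "name": "1_Xgboost",  # hypothetical
        "preprocessing": {
            "golden_features": {
                "results_path": "AutoML_1",  # hypothetical
                "ml_task": "binary_classification",
                "features_count": 50,  # set only for a non-bool int
            }
        },
    }
    params["name"] += "_GoldenFeatures"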
2 changes: 1 addition & 1 deletion supervised/validation/validation_step.py
@@ -25,7 +25,7 @@ def __init__(self, params):
raise AutoMLException(
f"The validation type ({self.validation_type}) is not implemented."
)

def get_split(self, k, repeat=0):
return self.validator.get_split(k, repeat)

24 changes: 11 additions & 13 deletions supervised/validation/validator_custom.py
@@ -23,11 +23,9 @@ def __init__(self, params):
BaseValidator.__init__(self, params)

cv_path = self.params.get("cv_path")

if cv_path is None:
- raise AutoMLException(
-     "You need to specify `cv` as list or iterable"
- )
+ raise AutoMLException("You need to specify `cv` as list or iterable")

self.cv = joblib.load(cv_path)
self.cv = list(self.cv)
@@ -48,14 +46,10 @@ def __init__(self, params):

print("Custom validation strategy")
for fold_cnt, (train_index, validation_index) in enumerate(self.cv):

print(f"Split {fold_cnt}.")
- print(
-     f"Train {train_index.shape[0]} samples."
- )
- print(
-     f"Validation {validation_index.shape[0]} samples."
- )
+ print(f"Train {train_index.shape[0]} samples.")
+ print(f"Validation {validation_index.shape[0]} samples.")
train_index_file = os.path.join(
self._results_path,
"folds",
@@ -88,19 +82,23 @@ def get_split(self, k, repeat=0):
X = load_data(self._X_path)
y = load_data(self._y_path)
y = y["target"]

sample_weight = None
if self._sample_weight_path is not None:
sample_weight = load_data(self._sample_weight_path)
sample_weight = sample_weight["sample_weight"]

train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]}
validation_data = {"X": X.iloc[validation_index], "y": y.iloc[validation_index]}
validation_data = {
"X": X.iloc[validation_index],
"y": y.iloc[validation_index],
}
if sample_weight is not None:
train_data["sample_weight"] = sample_weight.iloc[train_index]
validation_data["sample_weight"] = sample_weight.iloc[validation_index]
except Exception as e:
import traceback

print(traceback.format_exc())
raise AutoMLException("Problem with custom validation. " + str(e))
return (train_data, validation_data)
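Putting the custom validator together with the `fit(..., cv=...)` parameter documented in `supervised/automl.py` above, end-to-end usage looks roughly like this (a sketch; `KFold` is just one convenient way to build the (train, validation) index splits):

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold
    from supervised import AutoML

    X = pd.DataFrame(np.random.rand(100, 5), columns=[f"f{i}" for i in range(5)])
    y = np.random.randint(0, 2, 100)

    # Each cv entry is a (train_index, validation_index) pair of index arrays.
    cv = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X))

    automl = AutoML(validation_strategy={"validation_type": "custom"})
    automl.fit(X, y, cv=cv)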
120 changes: 120 additions & 0 deletions tests/tests_automl/test_golden_features.py
@@ -0,0 +1,120 @@
import os
import unittest
import tempfile
import json
import numpy as np
import pandas as pd
import shutil
from supervised import AutoML
from numpy.testing import assert_almost_equal
from sklearn import datasets
from supervised.exceptions import AutoMLException

class AutoMLGoldenFeaturesTest(unittest.TestCase):

automl_dir = "automl_tests"
rows = 50

def tearDown(self):
shutil.rmtree(self.automl_dir, ignore_errors=True)

def test_no_golden_features(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=1,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=False,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 1)

def test_golden_features(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=10,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=True,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 2)

# there should be 10 golden features
with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
d = json.loads(fin.read())
self.assertEqual(len(d["new_features"]), 10)


def test_golden_features_count(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

automl = AutoML(
results_path=self.automl_dir,
total_time_limit=10,
algorithms=["Xgboost"],
train_ensemble=False,
golden_features=50,
explain_level=0,
start_random_models=1,
)
automl.fit(X, y)

self.assertEqual(len(automl._models), 2)

# there should be 50 golden features
with open(os.path.join(self.automl_dir, "golden_features.json")) as fin:
d = json.loads(fin.read())
self.assertEqual(len(d["new_features"]), 50)
37 changes: 33 additions & 4 deletions tests/tests_preprocessing/test_goldenfeatures_transformer.py
@@ -38,16 +38,12 @@ def test_transformer(self):
)

df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
- print(df)

with tempfile.TemporaryDirectory() as tmpdir:
gft = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft.fit(df, y)

df = gft.transform(df)
- print(df)

- print(gft.to_json())

gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft3.from_json(gft.to_json(), tmpdir)
@@ -138,3 +134,36 @@ def test_subsample_binclass_4k(self):
for uni in [np.unique(y_train), np.unique(y_test)]:
for i in range(2):
self.assertTrue(i in uni)


def test_features_count(self):

N_COLS = 10
X, y = datasets.make_classification(
n_samples=100,
n_features=N_COLS,
n_informative=6,
n_redundant=1,
n_classes=2,
n_clusters_per_class=3,
n_repeated=0,
shuffle=False,
random_state=0,
)

df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])


with tempfile.TemporaryDirectory() as tmpdir:
FEATURES_COUNT = 42
gft = GoldenFeaturesTransformer(tmpdir, "binary_classification", features_count = FEATURES_COUNT)
gft.fit(df, y)

self.assertEqual(len(gft._new_features), FEATURES_COUNT)

gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
gft3.from_json(gft.to_json(), tmpdir)

df = gft3.transform(df)
self.assertEqual(df.shape[1], N_COLS + FEATURES_COUNT)
