shap selector refactoring
antonkulaga committed Sep 19, 2020
1 parent 3fbbc63 commit b084fd1
Showing 10 changed files with 214 additions and 2,842 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 *.class
 *.log
 logs
+.~*
 
 #VIM specific
 tags
1 change: 1 addition & 0 deletions data/interim/optimization/.gitignore
@@ -4,3 +4,4 @@
 /mass_kg.sqlite
 /gestation_days.sqlite
 /lifespan_2.sqlite
+/temperature.sqlite
Binary file removed data/interim/optimization/temperature.sqlite
3 changes: 3 additions & 0 deletions data/interim/optimization/temperature.sqlite.dvc
@@ -0,0 +1,3 @@
+outs:
+- md5: 2fecb538190048e43e4718a7d19a6020
+  path: temperature.sqlite
2 changes: 1 addition & 1 deletion environment.yaml
@@ -18,7 +18,7 @@ dependencies:
   - shap>=0.35.0
   - lightgbm>=3.0.0
   - statsmodels>=0.11.1
-  - optuna>=2.0.0
+  - optuna>=2.1.0
   - click>=7.1.2
   - loguru>=0.5.0
   #- yspecies=0.2.3
2,790 changes: 30 additions & 2,760 deletions notebooks/stage_one_shap_selection.ipynb

Large diffs are not rendered by default.

90 changes: 90 additions & 0 deletions yspecies/helpers.py
@@ -0,0 +1,90 @@
+from functools import cached_property
+
+from sklearn.pipeline import Pipeline
+
+from yspecies.config import *
+from yspecies.partition import DataPartitioner
+from yspecies.partition import PartitionParameters
+from yspecies.preprocess import DataExtractor
+from yspecies.results import FeatureSummary
+from yspecies.selection import CrossValidator, ShapSelector
+from yspecies.tuning import MultiObjectiveResults
+from yspecies.workflow import TupleWith, Repeat, Collect
+
+
+@dataclass
+class PipelineFactory:
+    locations: Locations
+    repeats: int = 10
+    n_folds: int = 5
+    n_hold_out: int = 1
+
+    @cached_property
+    def partition_parameters(self):
+        return PartitionParameters(self.n_folds, self.n_hold_out, 2, 42)
+
+    def load_study_by_trait(self, trait: str, study_name: str = None):
+        path = self.locations.interim.optimization / (trait + ".sqlite")
+        study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
+        return self.load_study(path, study_name)
+
+    def load_study(self, path: Path, name: str):
+        url = f'sqlite:///' + str(path.absolute())
+        print('loading (if exists) study from ' + url)
+        storage = optuna.storages.RDBStorage(
+            url=url
+            # engine_kwargs={'check_same_thread': False}
+        )
+        return optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'], storage=storage, study_name=name, load_if_exists=True)
+
+    def make_partitioning_shap_pipeline(self, trait: str, study_name: str = None):
+        study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
+        study = self.load_study_by_trait(trait, study_name)
+        if len(study.get_pareto_front_trials()) > 0:
+            metrics, params = MultiObjectiveResults.from_study(study).best_metrics_params_r2()
+            params["verbose"] = -1
+            if "early_stopping_round" not in params:
+                params["early_stopping_round"] = 10
+        else:
+            params = {"bagging_fraction": 0.9522534844058304,
+                      "boosting_type": "dart",
+                      "objective": "regression",
+                      "feature_fraction": 0.42236910941558053,
+                      "lambda_l1": 0.020847266580277746,
+                      "lambda_l2": 2.8448564854773326,
+                      "learning_rate": 0.11484015430016059,
+                      "max_depth": 3,
+                      "max_leaves": 35,
+                      "min_data_in_leaf": 9,
+                      "num_iterations": 250,
+                      "metrics": ["l1", "l2", "huber"]
+                      }
+        return Pipeline([
+            ("partitioner", DataPartitioner()),
+            ('prepare_for_selection', TupleWith(params)),
+            ("cross_validation", CrossValidator()),
+            ("shap_computation", ShapSelector())
+        ])
+
+    def make_shap_pipeline(self, trait: str, study_name: str = None):
+        partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
+        return Pipeline([
+            ('extractor', DataExtractor()),
+            ('prepare_for_partitioning', TupleWith(self.partition_parameters)),  # to extract the data required for ML from the dataset
+            ("partition_shap", partition_shap_pipe)
+        ])
+
+    def make_repeated_shap_pipeline(self, trait: str, study_name: str = None):
+        partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
+        repeated_cv = Repeat(partition_shap_pipe, self.repeats, lambda x, i: (x[0], replace(x[1], seed=i)))
+        return Pipeline([
+            ('extractor', DataExtractor()),
+            ('prepare_for_partitioning', TupleWith(self.partition_parameters)),  # to extract the data required for ML from the dataset
+            ("repeated_partition_shap", repeated_cv),
+            ("summarize", Collect(fold=lambda results: FeatureSummary(results)))
+        ])
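For orientation, a minimal sketch of how this new factory might be wired up. The trait name and the Locations constructor argument are assumptions for illustration; PipelineFactory and its methods are as in the diff above:

from pathlib import Path

from yspecies.config import Locations
from yspecies.helpers import PipelineFactory

# Locations wraps the repository's data folder layout; the base path here is an assumption
locations = Locations(Path("."))
factory = PipelineFactory(locations, repeats=10, n_folds=5, n_hold_out=1)

# DataExtractor -> DataPartitioner -> CrossValidator -> ShapSelector, repeated 10 times
# with per-repeat seeds and collected into a FeatureSummary
pipeline = factory.make_repeated_shap_pipeline("lifespan")  # "lifespan" is an example trait
# summary = pipeline.fit_transform(dataset)  # dataset: an ExpressionDataset instance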
2 changes: 1 addition & 1 deletion yspecies/models.py
@@ -122,7 +122,7 @@ def _repr_html_(self):


 @dataclass
-class CrossValidator(TransformerMixin):
+class BasicCrossValidator(TransformerMixin):
 
     evaluation: ResultsCV = None
     num_iterations: int = 200
148 changes: 79 additions & 69 deletions yspecies/selection.py
@@ -15,18 +15,75 @@
 from yspecies.partition import ExpressionPartitions
 from pathlib import Path
 
 
 @dataclass(frozen=True)
 class Fold:
     '''
     Class to contain information about the fold, useful for reproducibility
     '''
-    feature_weights: np.ndarray
-    shap_dataframe: pd.DataFrame
-    metrics: Metrics
-    validation_species: List = field(default_factory=lambda: [])
-    validation_metrics: Metrics = None
-    booster: Booster = None
-    eval: List[BasicMetrics] = field(default_factory=lambda: [])
+    num: int
+    model: Booster
+    partitions: ExpressionPartitions
+    current_evals: List[BasicMetrics] = field(default_factory=lambda: [])
+
+    @cached_property
+    def explainer(self) -> shap.TreeExplainer:
+        return shap.TreeExplainer(self.model, feature_perturbation=self.partitions.features.feature_perturbation, data=self.partitions.X)
+
+    @cached_property
+    def shap_values(self):
+        return self.explainer.shap_values(self.partitions.X)
+
+    @cached_property
+    def feature_weights(self) -> np.ndarray:
+        return self.model.feature_importance(importance_type=self.partitions.features.importance_type)
+
+    @cached_property
+    def shap_dataframe(self) -> pd.DataFrame:
+        return pd.DataFrame(data=self.shap_values, index=self.partitions.X.index, columns=self.partitions.X.columns)
+
+    @cached_property
+    def validation_species(self):
+        return self.partitions.validation_species[self.num]
+
+    @cached_property
+    def _fold_train(self):
+        return self.partitions.fold_train(self.num)
+
+    @property
+    def X_train(self):
+        return self._fold_train[0]
+
+    @property
+    def y_train(self):
+        return self._fold_train[1]
+
+    @cached_property
+    def X_test(self):
+        return self.partitions.partitions_x[self.num]
+
+    @cached_property
+    def y_test(self):
+        return self.partitions.partitions_y[self.num]
+
+    @cached_property
+    def fold_predictions(self):
+        return self.model.predict(self.X_test)
+
+    @cached_property
+    def validation_metrics(self):
+        # metrics (not raw predictions) computed on the hold-out set, when one exists
+        return Metrics.calculate(self.partitions.hold_out_y, self.model.predict(self.partitions.hold_out_x)) if self.partitions.n_hold_out > 0 else None
+
+    @cached_property
+    def metrics(self):
+        return Metrics.calculate(self.y_test, self.fold_predictions, self.eval_metrics.huber)
+
+    @cached_property
+    def eval_metrics(self):
+        # prefer the booster's best iteration when it points inside the recorded evals,
+        # otherwise fall back to the last recorded iteration
+        best_iteration_num = self.model.best_iteration
+        eval_last_num = len(self.current_evals) - 1
+        metrics_num = best_iteration_num if best_iteration_num is not None and 0 <= best_iteration_num < eval_last_num else eval_last_num
+        return self.current_evals[metrics_num] if self.current_evals[metrics_num].huber < self.current_evals[eval_last_num].huber else self.current_evals[eval_last_num]
 
-    @cached_property
-    def shap_values(self) -> List[np.ndarray]:
@@ -58,15 +115,14 @@ def _repr_html_(self):
 
 from yspecies.results import FeatureResults


 @dataclass
-class ShapSelector(TransformerMixin):
+class CrossValidator(TransformerMixin):
     early_stopping_rounds: int = 10
     models: List = field(default_factory=lambda: [])
     evals: List = field(default_factory=lambda: [])
 
     @logger.catch
-    def fit(self, to_fit: Tuple[ExpressionPartitions, Dict], y=None) -> 'DataExtractor':
+    def fit(self, to_fit: Tuple[ExpressionPartitions, Dict], y=None) -> 'CrossValidator':
         """
         :param to_fit: (partitions, parameters)
@@ -117,70 +173,24 @@ def regression_model(self, X_train, X_test, y_train, y_test, parameters: Dict, c
         )
         return gbm, BasicMetrics.parse_eval(evals_result)
 
-    def compute_folds(self, partitions: ExpressionPartitions) -> List[Fold]:
-        '''
-        Subfunction to compute weight_of_features, shap_values_out_of_fold, metrics_out_of_fold
-        :param partitions:
-        :return:
-        '''
-
-        # shap_values_out_of_fold = np.zeros()
-        # interaction_values_out_of_fold = [[[0 for i in range(len(X.values[0]))] for i in range(len(X.values[0]))] for z in range(len(X))]
-        # metrics = pd.DataFrame(np.zeros([folds, 3]), columns=["R^2", "MSE", "MAE"])
-        # .sum(axis=0)
+    @logger.catch
+    def transform(self, to_select_from: Tuple[ExpressionPartitions, Dict]) -> Tuple[List[Fold], Dict]:
+        partitions, parameters = to_select_from
+        assert len(self.models) == partitions.n_cv_folds, "for each bootstrap there should be a model"
+        folds = [Fold(i, self.models[i], partitions, self.evals[i]) for i in range(0, partitions.n_cv_folds)]
+        return (folds, parameters)
+
+
+@dataclass
+class ShapSelector(TransformerMixin):
 
-        result = []
-
-        X_hold_out = partitions.hold_out_x
-        Y_hold_out = partitions.hold_out_y
-        cat = partitions.categorical_index if partitions.categorical_index is not None and len(
-            partitions.categorical_index) > 0 else "auto"
-        lgb_hold_out = lgb.Dataset(X_hold_out, Y_hold_out, categorical_feature=cat)
-
-        for i in range(0, partitions.n_cv_folds):
-
-            X_test = partitions.partitions_x[i]
-            y_test = partitions.partitions_y[i]
-            (X_train, y_train) = partitions.fold_train(i)
-
-            # get trained model and record accuracy metrics
-            model: Booster = self.models[i]  # just using already trained model
-            fold_predictions = model.predict(X_test)
-
-            if partitions.n_hold_out > 0:
-                fold_validation_predictions = model.predict(partitions.hold_out_x)
-
-            explainer = shap.TreeExplainer(model, feature_perturbation=partitions.features.feature_perturbation, data=partitions.X)
-            shap_values = explainer.shap_values(partitions.X)
-            best_iteration_num = model.best_iteration
-            current_evals = self.evals[i]
-            eval_last_num = len(current_evals) - 1
-            metrics_num = best_iteration_num if best_iteration_num is not None and best_iteration_num < eval_last_num and best_iteration_num >= 0 else eval_last_num
-            best_metrics = current_evals[metrics_num] if current_evals[metrics_num].huber < current_evals[eval_last_num].huber else current_evals[eval_last_num]
-            f = Fold(feature_weights=model.feature_importance(importance_type=partitions.features.importance_type),
-                     shap_dataframe=pd.DataFrame(data=shap_values, index=partitions.X.index,
-                                                 columns=partitions.X.columns),
-                     metrics=Metrics.calculate(y_test, fold_predictions, best_metrics.huber),
-                     validation_metrics=Metrics.calculate(Y_hold_out,
-                                                          fold_validation_predictions) if partitions.n_hold_out > 0 else None,
-                     validation_species=partitions.validation_species[i],
-                     booster=model,
-                     eval=best_metrics
-                     )
-            result.append(f)
-
-            # interaction_values = explainer.shap_interaction_values(X)
-            # shap_values_out_of_fold = np.add(shap_values_out_of_fold, shap_values)
-            # interaction_values_out_of_fold = np.add(interaction_values_out_of_fold, interaction_values)
-        return result
+    def fit(self, folds_with_params: Tuple[List[Fold], Dict], y=None) -> 'ShapSelector':
+        return self
 
     @logger.catch
-    def transform(self, to_select_from: Tuple[ExpressionPartitions, Dict]) -> FeatureResults:
-
-        partitions, parameters = to_select_from
-        folds = self.compute_folds(partitions)
+    def transform(self, folds_with_params: Tuple[List[Fold], Dict]) -> FeatureResults:
+        folds, parameters = folds_with_params
         fold_shap_values = [f.shap_values for f in folds]
+        partitions = folds[0].partitions
         # calculate shap values out of fold
         mean_shap_values = np.nanmean(fold_shap_values, axis=0)
         shap_values_transposed = mean_shap_values.T
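The net effect of this refactoring: the SHAP bookkeeping moves out of the imperative compute_folds loop into the Fold dataclass itself, where explainer, shap_values, and metrics are cached_property values computed on demand. CrossValidator now only pairs each trained booster with the partitions into a Fold, and ShapSelector reduces those folds to FeatureResults. A rough sketch of inspecting one fold under the new API, assuming a CrossValidator that has already been fitted on (partitions, params):

folds, params = cross_validator.transform((partitions, params))
fold = folds[0]
fold.shap_dataframe      # lazily computed: SHAP values as a samples x genes DataFrame
fold.metrics             # lazily computed: metrics of this fold's out-of-fold predictions
fold.validation_metrics  # hold-out metrics, or None when n_hold_out == 0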
19 changes: 8 additions & 11 deletions yspecies/tuning.py
@@ -1,22 +1,19 @@
-import lightgbm as lgb
+from dataclasses import *
+from functools import cached_property
+
+import lightgbm as lgb
+import optuna
+from optuna import Trial
+from optuna.multi_objective import trial
+from optuna.multi_objective.study import MultiObjectiveStudy
+from optuna.multi_objective.trial import FrozenMultiObjectiveTrial
 from sklearn.base import TransformerMixin
-from dataclasses import *
 
 from sklearn.pipeline import Pipeline
 
-from yspecies.models import Metrics, CrossValidator, ResultsCV
+from yspecies.models import Metrics, ResultsCV, BasicCrossValidator
 from yspecies.partition import ExpressionPartitions
 from yspecies.utils import *
 
-import optuna
-from optuna import Study, Trial
-from optuna import multi_objective
-from loguru import logger
-from optuna.multi_objective import trial
-from optuna.multi_objective.study import MultiObjectiveStudy

@dataclass(frozen=True)
class SpecializedTuningResults:
@@ -128,7 +125,7 @@ def results(self) -> Dict:

 @dataclass(frozen=False)
 class Tune(TransformerMixin):
-    transformer: Union[Union[TransformerMixin, Pipeline], CrossValidator]
+    transformer: Union[Union[TransformerMixin, Pipeline], BasicCrossValidator]
     n_trials: int
     def objective_parameters(trial: Trial) -> dict:
         return {
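A note on the study that Tune and PipelineFactory.load_study share: the ['maximize', 'minimize', 'maximize'] directions line up with the {trait}_r2_huber_kendall study naming, that is, maximize R^2, minimize Huber loss, maximize Kendall tau. A minimal sketch of creating such a study with optuna 2.x's (then-experimental) multi-objective API, using a hypothetical local SQLite path:

import optuna

study = optuna.multi_objective.study.create_study(
    directions=['maximize', 'minimize', 'maximize'],  # R^2, huber, kendall_tau
    storage='sqlite:///lifespan.sqlite',  # hypothetical path
    study_name='lifespan_r2_huber_kendall',
    load_if_exists=True)
# each trial's objective must return three values, one per direction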
