[ADD] Minority Coalescer #242
New file (@@ -0,0 +1,52 @@): the MinorityCoalescer component

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
)

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
from autoPyTorch.utils.implementations import MinorityCoalescing


class MinorityCoalescer(BaseCoalescer):
    """
    Groups together classes in a categorical feature if the frequency
    of occurrence is less than minimum_fraction.
    """
    def __init__(self, minimum_fraction: float, random_state: np.random.RandomState):
        super().__init__()
        self.minimum_fraction = minimum_fraction
        # Review thread on the name `minimum_fraction`:
        # "I don't think we have a convention for this, and in this case, having
        # the complete word is more clear. If possible I would like to preserve it."
        # "Pytorch uses …"
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
        self.check_requirements(X, y)
        # Delegate the actual grouping of rare categories to MinorityCoalescing.
        self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'MinorityCoalescer',
            'name': 'Minority Feature-class coalescer',
            'handles_sparse': False
        }

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict] = None,
        minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction",
                                                                                value_range=(0.0001, 0.5),
                                                                                default_value=0.01,
                                                                                log=True),
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter)

        return cs
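
The actual grouping logic lives in MinorityCoalescing (imported from autoPyTorch.utils.implementations), which is not part of this diff. As a rough, standalone sketch of the idea only — coalesce_minority and minority_label are made-up names, not the autoPyTorch implementation:

import numpy as np


def coalesce_minority(column: np.ndarray, minimum_fraction: float = 0.01,
                      minority_label: int = -2) -> np.ndarray:
    # Relative frequency of each distinct category value in the column.
    values, counts = np.unique(column, return_counts=True)
    fractions = counts / column.shape[0]
    # Every value that occurs less often than minimum_fraction is mapped
    # to one shared minority label.
    rare = set(values[fractions < minimum_fraction])
    return np.array([minority_label if v in rare else v for v in column])


print(coalesce_minority(np.array([0, 0, 0, 1, 1, 2]), minimum_fraction=0.3))
# -> [ 0  0  0  1  1 -2]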
New file (@@ -0,0 +1,53 @@): the NoCoalescer component

from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
    """
    Do not perform any coalescing on the categorical features;
    they are passed through unchanged.
    """
    # Review thread on the docstring: "Why?" — "This is a choice object. The
    # choice object selects between MinorityCoalescer and NoCoalescer depending
    # on what gives better performance." — "I mean I did not get if you mean …"
    def __init__(self,
                 random_state: np.random.RandomState,
                 ):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
        """
        As no coalescing happens, the input fit dictionary is unchanged.

        Args:
            X (Dict[str, Any]):
                input fit dictionary
            y (Optional[Any]):
                Parameter to comply with scikit-learn API. Not used.

        Returns:
            instance of self
        """
        self.check_requirements(X, y)

        return self

    # Review thread on the 'X' fit dictionary: "What is …" — "Scikit-learn
    # supports passing a dictionary alongside the data. See here. It makes a lot
    # of sense to use it instead of X as a fit_dictionary. From all of the
    # refactoring changes, this is to me the most important. When depends on when
    # there is a contributor that wants to do this change :)" — "Sorry I did not
    # get you, so please add your ideas to the doc-string as well?"
    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        """
        Add the preprocessor into the 'X' dictionary and return the modified dict.

        Args:
            X (Dict[str, Any]): 'X' dictionary

        Returns:
            (Dict[str, Any]): the updated 'X' dictionary
        """
        X.update({'coalescer': self.preprocessor})
        return X

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'NoCoalescer',
            'name': 'No Coalescer',
            'handles_sparse': True
        }
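
The review exchange above is about the fit-dictionary pattern: every pipeline step receives one dictionary X carrying both the data and fit directives, and transform() records the step's result back into it. A toy sketch of that contract — PassthroughStep is a made-up stand-in for NoCoalescer, not autoPyTorch code:

from typing import Any, Dict


class PassthroughStep:
    """Stand-in for NoCoalescer: fit() only validates, transform() records itself in X."""

    def fit(self, X: Dict[str, Any], y: Any = None) -> "PassthroughStep":
        # Mimics check_requirements(): the fit dictionary must describe the dataset.
        assert 'categorical_columns' in X['dataset_properties']
        return self

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        X.update({'coalescer': None})  # nothing was fitted
        return X


X = {'dataset_properties': {'categorical_columns': [0, 2],
                            'categories': [['a', 'b'], ['x', 'y']]}}
X = PassthroughStep().fit(X).transform(X)
print('coalescer' in X)  # True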
New file (@@ -0,0 +1,143 @@): the CoalescerChoice module

import os
from collections import OrderedDict
from typing import Any, Dict, List, Optional

import ConfigSpace.hyperparameters as CSH
from ConfigSpace.configuration_space import ConfigurationSpace

from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
from autoPyTorch.pipeline.components.base_component import (
    ThirdPartyComponents,
    autoPyTorchComponent,
    find_components,
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


coalescer_directory = os.path.split(__file__)[0]
_coalescer = find_components(__package__,
                             coalescer_directory,
                             BaseCoalescer)
_addons = ThirdPartyComponents(BaseCoalescer)


def add_coalescer(coalescer: BaseCoalescer) -> None:
    _addons.add_component(coalescer)


class CoalescerChoice(autoPyTorchChoice):
    """
    Allows for dynamically choosing the coalescer component at runtime.
    """

    def get_components(self) -> Dict[str, autoPyTorchComponent]:
        """Returns the available coalescer components.

        Args:
            None

        Returns:
            Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
                as choices for coalescing the categorical columns
        """
        components = OrderedDict()
        components.update(_coalescer)
        components.update(_addons.components)
        return components
    def get_hyperparameter_search_space(self,
                                        dataset_properties: Optional[Dict[str, Any]] = None,
                                        default: Optional[str] = None,
                                        include: Optional[List[str]] = None,
                                        exclude: Optional[List[str]] = None) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        if dataset_properties is None:
            dataset_properties = dict()

        dataset_properties = {**self.dataset_properties, **dataset_properties}

        available_preprocessors = self.get_available_components(dataset_properties=dataset_properties,
                                                                include=include,
                                                                exclude=exclude)

        if len(available_preprocessors) == 0:
            raise ValueError("No coalescer found. Please add a coalescer via the include "
                             "argument of the pipeline, or remove the coalescer step "
                             "from the pipeline altogether.")

        if default is None:
            defaults = ['NoCoalescer', 'MinorityCoalescer']
            for default_ in defaults:
                if default_ in available_preprocessors:
                    if include is not None and default_ not in include:
                        continue
                    if exclude is not None and default_ in exclude:
                        continue
                    default = default_
                    break

        updates = self._get_search_space_updates()
        if '__choice__' in updates.keys():
            choice_hyperparameter = updates['__choice__']
            if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
                raise ValueError("The update for {} was expected to be a subset of {} "
                                 "but was {}".format(self.__class__.__name__,
                                                     available_preprocessors,
                                                     choice_hyperparameter.value_range))
            if len(dataset_properties['categorical_columns']) == 0:
                # Without categorical columns, NoCoalescer is the only valid choice.
                assert len(choice_hyperparameter.value_range) == 1
                assert 'NoCoalescer' in choice_hyperparameter.value_range, \
                    "Provided {} in choices, however, the dataset " \
                    "is incompatible with it".format(choice_hyperparameter.value_range)

            preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                         choice_hyperparameter.value_range,
                                                         default_value=choice_hyperparameter.default_value)
        else:
            # Add only NoCoalescer to the choice hyperparameters when the dataset is purely numerical.
            if len(dataset_properties['categorical_columns']) == 0:
                default = 'NoCoalescer'
                if include is not None and default not in include:
                    raise ValueError("Provided coalescers {} are incompatible with "
                                     "a dataset without categorical columns.".format(include))
                preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                             ['NoCoalescer'],
                                                             default_value=default)
            else:
                preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                             list(available_preprocessors.keys()),
                                                             default_value=default)

        cs.add_hyperparameter(preprocessor)

        # Add only the child hyperparameters of the chosen coalescer components.
        for name in preprocessor.choices:
            updates = self._get_search_space_updates(prefix=name)
            # The call arg is ignored by mypy as the search space dynamically
            # provides different args.
            preprocessor_configuration_space = available_preprocessors[  # type:ignore[call-arg]
                name  # type:ignore[call-arg]
            ].get_hyperparameter_search_space(dataset_properties, **updates)  # type:ignore[call-arg]
            parent_hyperparameter = {'parent': preprocessor, 'value': name}
            cs.add_configuration_space(name, preprocessor_configuration_space,
                                       parent_hyperparameter=parent_hyperparameter)

        self.configuration_space = cs
        self.dataset_properties = dataset_properties
        return cs

    def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None:
        """
        A mechanism to ensure the correctness of the dataset properties in the
        fit dictionary. It recursively makes sure that the child- and
        parent-level requirements are honored before fit.

        Args:
            dataset_properties (Dict[str, Any]): the dataset properties to validate
        """
        super()._check_dataset_properties(dataset_properties)
        assert 'numerical_columns' in dataset_properties.keys(), \
            "Dataset properties must contain information about numerical columns"
        assert 'categorical_columns' in dataset_properties.keys(), \
            "Dataset properties must contain information about categorical columns"
New file (@@ -0,0 +1,36 @@): the BaseCoalescer base class (autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py)

from typing import Any, Dict, List

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
    autoPyTorchTabularPreprocessingComponent
)
from autoPyTorch.utils.common import FitRequirement


class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
    """
    Base class for coalescing
    """
    def __init__(self) -> None:
        super().__init__()
        self.add_fit_requirements([
            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
            FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        """
        The input X is the fit dictionary, which contains both the train data
        and fit directives. For example, it indicates whether to use the GPU
        or to perform a CPU-only run.

        This method adds the fitted preprocessor to the 'X' dictionary and returns it.

        Args:
            X (Dict[str, Any]): 'X' dictionary

        Returns:
            (Dict[str, Any]): the updated 'X' dictionary
        """
        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
            raise ValueError("Cannot call transform() on {} without calling fit() first."
                             .format(self.__class__.__name__))
        X.update({'coalescer': self.preprocessor})
        return X
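
For third-party components, the add_coalescer() hook defined in the choice module registers a new BaseCoalescer subclass with the ThirdPartyComponents registry, after which it shows up in CoalescerChoice.get_components(). A hedged sketch — CustomCoalescer is hypothetical, and the import assumes the choice module above is the coalescer package's __init__.py:

from typing import Any, Dict, Optional, Union

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import add_coalescer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class CustomCoalescer(BaseCoalescer):
    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
        self.check_requirements(X, y)
        # Set a fitted sklearn-style transformer for the categorical columns here.
        self.preprocessor['categorical'] = ...
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {'shortname': 'CustomCoalescer',
                'name': 'Custom Coalescer',
                'handles_sparse': False}


add_coalescer(CustomCoalescer)  # now discoverable by CoalescerChoice.get_components()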