[ADD] Subsampling Dataset (#398)
* initial implementation

* fix issue with missing classes

* finalise implementation, add documentation

* fix tests

* add tests from ask

* fix issues from feature preprocessing PR

* address comments from shuhei

* address comments from code review

* address comments from shuhei
ravinkohli authored Mar 9, 2022
1 parent aa0eec5 commit bdd3fa8
Showing 15 changed files with 760 additions and 293 deletions.
41 changes: 40 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -39,6 +39,7 @@
STRING_TO_TASK_TYPES,
)
from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.utils import DatasetCompressionSpec
from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
@@ -299,6 +300,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -323,6 +325,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
Returns:
BaseDataset:
@@ -341,6 +346,7 @@ def get_dataset(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -363,6 +369,38 @@ def get_dataset(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
We compress datasets so that they fit into some predefined amount of memory.
**NOTE**
You can also pass your own configuration with the same keys and choosing
from the available ``"methods"``.
The available options are described here:
**memory_allocation**
Absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further methods
listed in ``"methods"`` will not be performed.
It can be either float or int.
**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``subsample`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
Returns:
BaseDataset:
@@ -375,7 +413,8 @@ def get_dataset(
y_test=y_test,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name)
dataset_name=dataset_name,
dataset_compression=dataset_compression)

return dataset
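
As a usage note for this diff: a minimal sketch of passing the new ``dataset_compression`` argument to ``get_dataset``, assuming a ``TabularClassificationTask`` estimator and synthetic data. The spec keys follow the docstring above; the parts of the ``get_dataset`` signature outside this hunk (``X_train``, ``y_train``) are assumptions.

```python
# Minimal sketch (not part of this commit): a dataset_compression spec passed
# to get_dataset. The estimator choice and toy data are illustrative only.
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X = np.random.rand(20_000, 30)               # float64 features, roughly 4.6 MB
y = np.random.randint(0, 3, size=20_000)     # three-class labels

compression_spec = {
    "memory_allocation": 2,                  # budget in MB (int or float)
    "methods": ["precision", "subsample"],   # applied in the order given
}

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X,
    y_train=y,
    dataset_compression=compression_spec,
)
```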

34 changes: 24 additions & 10 deletions autoPyTorch/api/tabular_classification.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
Returns:
TabularDataset:
the dataset object.
@@ -396,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.
**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``subsample`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
Returns:
self
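
The ``subsample`` behaviour documented above is stratified on the classification labels and keeps at least one occurrence of every label. A rough sketch of such a routine follows; it is an illustration only, not the implementation in ``autoPyTorch.data.utils``, and the helper name ``subsample_stratified`` is hypothetical.

```python
# Illustration only: label-aware subsampling in the spirit of the "subsample"
# method described above. Not the autoPyTorch implementation.
import numpy as np
from sklearn.model_selection import train_test_split


def subsample_stratified(X, y, sample_size, random_state=42):
    """Keep `sample_size` rows, stratified on y, with every label represented."""
    rng = np.random.RandomState(random_state)
    # Guarantee one occurrence of each label in the sampled set.
    keep = np.array([rng.choice(np.flatnonzero(y == label)) for label in np.unique(y)])
    n_extra = sample_size - len(keep)
    if n_extra > 0:
        remaining = np.setdiff1d(np.arange(len(y)), keep)
        # Fill the remaining budget with a stratified draw over the other rows.
        extra, _ = train_test_split(
            remaining,
            train_size=n_extra,
            stratify=y[remaining],
            random_state=random_state,
        )
        keep = np.concatenate([keep, extra])
    return X[keep], y[keep]


# e.g. X_small, y_small = subsample_stratified(X, y, sample_size=1_000)
```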
33 changes: 23 additions & 10 deletions autoPyTorch/api/tabular_regression.py
@@ -12,7 +12,8 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
get_dataset_compression_mapping
DatasetCompressionSpec,
get_dataset_compression_mapping,
)
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
Returns:
TabularDataset:
the dataset object.
@@ -397,14 +401,23 @@ def search(
listed in ``"methods"`` will not be performed.
**methods**
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` -
We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* pandas dataframes are reduced using the downcast option of `pd.to_numeric`
to the lowest possible precision.
* ``subsample`` -
We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
Returns:
self
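
The validator changes later in this diff unpack the compression spec straight into ``autoPyTorch.data.utils.reduce_dataset_size_if_too_large``. A hedged sketch of calling that utility directly follows; the keyword names are assumed from the ``**dataset_compression`` unpacking shown in ``tabular_validator.py``, and the toy data is illustrative.

```python
# Hedged sketch: calling the reduction utility directly, mirroring the call in
# TabularInputValidator._compress_dataset below. Keyword names are assumptions.
import numpy as np

from autoPyTorch.data.utils import reduce_dataset_size_if_too_large

X = np.random.rand(50_000, 100)    # float64, roughly 38 MB
y = np.random.rand(50_000)         # regression targets

X_small, y_small = reduce_dataset_size_if_too_large(
    X,
    y=y,
    is_classification=False,       # regression: no label stratification needed
    random_state=1,
    memory_allocation=10,          # budget in MB
    methods=["precision", "subsample"],
)
# "precision" downcasts float64 -> float32 first; "subsample" then drops rows
# only if the data still exceeds the allocation.
```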
42 changes: 2 additions & 40 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,6 +1,6 @@
import functools
from logging import Logger
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
from typing import Dict, List, Optional, Tuple, Union, cast

import numpy as np

@@ -18,11 +18,6 @@
from sklearn.pipeline import make_pipeline

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger

@@ -103,10 +98,7 @@ class TabularFeatureValidator(BaseFeatureValidator):
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
self._dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
):
super().__init__(logger)

@staticmethod
@@ -290,38 +282,8 @@ def transform(
"numerical or categorical values.")
raise e

X = self._compress_dataset(X)

return X

# TODO: modify once we have added subsampling as well.
def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.
Args:
X (DatasetCompressionInputType):
Dataset
Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self._dataset_compression is None:
return X
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X

def _check_data(
self,
X: SupportedFeatTypes,
80 changes: 73 additions & 7 deletions autoPyTorch/data/tabular_validator.py
@@ -1,10 +1,21 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Any, Mapping, Optional, Union
from typing import Optional, Tuple, Union

import numpy as np

from scipy.sparse import issparse

from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetCompressionSpec,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger


@@ -27,16 +38,22 @@ class TabularInputValidator(BaseInputValidator):
target_validator (TargetValidator):
A TargetValidator instance used to validate and encode (in case of classification)
the target values
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
dataset_compression: Optional[DatasetCompressionSpec] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.dataset_compression = dataset_compression
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
name='Validation',
@@ -46,10 +63,59 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
dataset_compression=self.dataset_compression,
logger=self.logger)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
)
self._is_fitted = False

def _compress_dataset(
self,
X: DatasetCompressionInputType,
y: SupportedTargetTypes,
) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.
See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
for more information.
Args:
X (DatasetCompressionInputType):
features of dataset
y (SupportedTargetTypes):
targets of dataset
Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = ispandas(X)
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self.dataset_compression is None:
return X, y
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X, y
else:
X, y = reduce_dataset_size_if_too_large(
X,
y=y,
is_classification=self.is_classification,
random_state=self.seed,
**self.dataset_compression # type: ignore [arg-type]
)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X, y

def transform(
self,
X: SupportedFeatTypes,
y: Optional[SupportedTargetTypes] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:

X, y = super().transform(X, y)
X_reduced, y_reduced = self._compress_dataset(X, y)

return X_reduced, y_reduced
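
Putting the pieces together, a minimal sketch of the new validator flow: the first ``transform`` call on the training data performs the reduction and records the reduced dtype, and later calls (for example on test data) are only cast to that dtype so train and test stay consistent. The ``fit`` keyword arguments and the toy data are assumptions for illustration.

```python
# Minimal sketch of the new TabularInputValidator flow; the compression spec
# keys follow the docstrings in this commit, everything else is illustrative.
import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator

X_train = np.random.rand(100_000, 50)            # float64, roughly 38 MB
y_train = np.random.randint(0, 2, size=100_000)
X_test = np.random.rand(1_000, 50)

validator = TabularInputValidator(
    is_classification=True,
    dataset_compression={"memory_allocation": 20, "methods": ["precision"]},
    seed=42,
)
validator.fit(X_train=X_train, y_train=y_train)

# Training transform reduces X_train (here float64 -> float32) and stores the
# reduced dtype in _reduced_dtype.
X_train_t, y_train_t = validator.transform(X_train, y_train)

# Test transform reuses the stored dtype: the data is cast, never re-reduced.
X_test_t, _ = validator.transform(X_test)
```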
(Diffs for the remaining 10 changed files are not shown.)
