
Commit bdd3fa8

[ADD] Subsampling Dataset (#398)
* initial implementation
* fix issue with missing classes
* finalise implementation, add documentation
* fix tests
* add tests from ask
* fix issues from feature preprocessing PR
* address comments from shuhei
* address comments from code review
* address comments from shuhei
1 parent aa0eec5 commit bdd3fa8


15 files changed: +760 −293 lines changed


autoPyTorch/api/base_task.py

Lines changed: 40 additions & 1 deletion
```diff
@@ -39,6 +39,7 @@
     STRING_TO_TASK_TYPES,
 )
 from autoPyTorch.data.base_validator import BaseInputValidator
+from autoPyTorch.data.utils import DatasetCompressionSpec
 from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
@@ -299,6 +300,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[BaseDataset, BaseInputValidator]:
         """
         Returns an object of a child class of `BaseDataset` and
@@ -323,6 +325,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
 
         Returns:
             BaseDataset:
@@ -341,6 +346,7 @@ def get_dataset(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> BaseDataset:
         """
         Returns an object of a child class of `BaseDataset` according to the current task.
@@ -363,6 +369,38 @@ def get_dataset(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                We compress datasets so that they fit into some predefined amount of memory.
+                **NOTE**
+
+                You can also pass your own configuration with the same keys and choosing
+                from the available ``"methods"``.
+                The available options are described here:
+
+                **memory_allocation**
+                    Absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
+                    The memory used by the dataset is checked after each reduction method is
+                    performed. If the dataset fits into the allocated memory, any further methods
+                    listed in ``"methods"`` will not be performed.
+                    It can be either float or int.
+
+                **methods**
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                        * ``np.float128 -> np.float64``
+                        * ``np.float96 -> np.float64``
+                        * ``np.float64 -> np.float32``
+                        * pandas dataframes are reduced using the downcast option of
+                          `pd.to_numeric` to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             BaseDataset:
@@ -375,7 +413,8 @@ def get_dataset(
             y_test=y_test,
             resampling_strategy=resampling_strategy,
             resampling_strategy_args=resampling_strategy_args,
-            dataset_name=dataset_name)
+            dataset_name=dataset_name,
+            dataset_compression=dataset_compression)
 
         return dataset
```
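A minimal sketch of how the new argument is meant to be used, based on the docstring above. The toy data and exact keyword names are illustrative assumptions, not part of this commit:

```python
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy data standing in for a real feature matrix and labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20)).astype(np.float64)
y = rng.integers(0, 2, size=1000)

api = TabularClassificationTask()
dataset = api.get_dataset(
    X_train=X,
    y_train=y,
    dataset_compression={
        # Absolute memory budget in MB; checked after each method runs.
        "memory_allocation": 10,
        # Applied in the given order; "subsample" should come last since
        # it forces the data under the budget.
        "methods": ["precision", "subsample"],
    },
)
```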

autoPyTorch/api/tabular_classification.py

Lines changed: 24 additions & 10 deletions
```diff
@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
+
         Returns:
             TabularDataset:
                 the dataset object.
@@ -396,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                  to the lowest possible precision.
+                We currently provide the following methods for reducing the dataset size.
+                These can be provided in a list and are performed in the order as given.
+                * ``"precision"`` -
+                    We reduce floating point precision as follows:
+                    * ``np.float128 -> np.float64``
+                    * ``np.float96 -> np.float64``
+                    * ``np.float64 -> np.float32``
+                    * pandas dataframes are reduced using the downcast option of
+                      `pd.to_numeric` to the lowest possible precision.
+                * ``subsample`` -
+                    We subsample data such that it **fits directly into
+                    the memory allocation** ``memory_allocation * memory_limit``.
+                    Therefore, this should likely be the last method listed in
+                    ``"methods"``.
+                    Subsampling takes into account classification labels and stratifies
+                    accordingly. We guarantee that at least one occurrence of each
+                    label is included in the sampled set.
 
         Returns:
             self
```
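The ``"precision"`` method described in the docstring boils down to dtype downcasting. A standalone sketch of the idea follows; this is not the library's internal helper (which lives in `autoPyTorch.data.utils`), just an illustration of the mechanism:

```python
import numpy as np
import pandas as pd

def downcast_precision(X):
    """Sketch of "precision" reduction: use the narrowest float dtype that fits."""
    if isinstance(X, pd.DataFrame):
        X = X.copy()
        # Downcast each float column via pd.to_numeric's downcast option.
        for col in X.select_dtypes(include="float").columns:
            X[col] = pd.to_numeric(X[col], downcast="float")
        return X
    if isinstance(X, np.ndarray) and X.dtype == np.float64:
        return X.astype(np.float32)
    return X

df = pd.DataFrame({"a": np.arange(4, dtype=np.float64)})
print(downcast_precision(df).dtypes["a"])  # float32
```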

autoPyTorch/api/tabular_regression.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
         Returns:
             TabularDataset:
                 the dataset object.
@@ -397,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                  to the lowest possible precision.
+                We currently provide the following methods for reducing the dataset size.
+                These can be provided in a list and are performed in the order as given.
+                * ``"precision"`` -
+                    We reduce floating point precision as follows:
+                    * ``np.float128 -> np.float64``
+                    * ``np.float96 -> np.float64``
+                    * ``np.float64 -> np.float32``
+                    * pandas dataframes are reduced using the downcast option of
+                      `pd.to_numeric` to the lowest possible precision.
+                * ``subsample`` -
+                    We subsample data such that it **fits directly into
+                    the memory allocation** ``memory_allocation * memory_limit``.
+                    Therefore, this should likely be the last method listed in
+                    ``"methods"``.
+                    Subsampling takes into account classification labels and stratifies
+                    accordingly. We guarantee that at least one occurrence of each
+                    label is included in the sampled set.
 
         Returns:
             self
```
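The stratified ``subsample`` step can be pictured with scikit-learn. Below is a rough sketch assuming the fraction of rows to keep has already been derived from the memory budget; unlike autoPyTorch's actual routine (`reduce_dataset_size_if_too_large` in `autoPyTorch.data.utils`), it does not enforce the at-least-one-occurrence-per-label guarantee:

```python
import numpy as np
from sklearn.model_selection import train_test_split

def subsample(X, y, sample_frac, is_classification, random_state=42):
    """Sketch: keep sample_frac of the rows, stratified on y for classification."""
    stratify = y if is_classification else None
    X_sub, _, y_sub, _ = train_test_split(
        X, y,
        train_size=sample_frac,
        stratify=stratify,          # preserves label proportions in the kept split
        random_state=random_state,
    )
    return X_sub, y_sub

X = np.arange(200).reshape(100, 2)
y = np.array([0] * 90 + [1] * 10)
X_sub, y_sub = subsample(X, y, 0.5, is_classification=True)
print(len(y_sub), int(y_sub.sum()))  # 50 rows, ~5 positives
```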

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 2 additions & 40 deletions
```diff
@@ -1,6 +1,6 @@
 import functools
 from logging import Logger
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Union, cast
 
 import numpy as np
 
@@ -18,11 +18,6 @@
 from sklearn.pipeline import make_pipeline
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
-from autoPyTorch.data.utils import (
-    DatasetCompressionInputType,
-    DatasetDTypeContainerType,
-    reduce_dataset_size_if_too_large
-)
 from autoPyTorch.utils.common import ispandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
@@ -103,10 +98,7 @@ class TabularFeatureValidator(BaseFeatureValidator):
     def __init__(
         self,
         logger: Optional[Union[PicklableClientLogger, Logger]] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
-    ) -> None:
-        self._dataset_compression = dataset_compression
-        self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
+    ):
         super().__init__(logger)
 
     @staticmethod
@@ -290,38 +282,8 @@ def transform(
                 "numerical or categorical values.")
             raise e
 
-        X = self._compress_dataset(X)
-
         return X
 
-    # TODO: modify once we have added subsampling as well.
-    def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
-        """
-        Compress the dataset. This function ensures that
-        the testing data is converted to the same dtype as
-        the training data.
-
-        Args:
-            X (DatasetCompressionInputType):
-                Dataset
-
-        Returns:
-            DatasetCompressionInputType:
-                Compressed dataset.
-        """
-        is_dataframe = ispandas(X)
-        is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
-        if not is_reducible_type or self._dataset_compression is None:
-            return X
-        elif self._reduced_dtype is not None:
-            X = X.astype(self._reduced_dtype)
-            return X
-        else:
-            X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
-            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
-            return X
-
     def _check_data(
         self,
         X: SupportedFeatTypes,
```
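The compression logic removed here moves into `TabularInputValidator` (next file). Its key invariant, that test data must be cast to whatever dtype the training data was reduced to, can be sketched in isolation; the class below is a hypothetical illustration, not code from this commit:

```python
import numpy as np

class DtypeMemo:
    """Sketch: compress on first call, replay the resulting dtype afterwards."""
    def __init__(self):
        self._reduced_dtype = None

    def apply(self, X: np.ndarray) -> np.ndarray:
        if self._reduced_dtype is not None:
            # Test/validation data: reuse the dtype chosen for the training data.
            return X.astype(self._reduced_dtype)
        X = X.astype(np.float32)  # stand-in for the real reduction step
        self._reduced_dtype = X.dtype
        return X

memo = DtypeMemo()
X_train = memo.apply(np.ones((5, 2), dtype=np.float64))   # reduced -> float32
X_test = memo.apply(np.zeros((3, 2), dtype=np.float64))   # cast to match
assert X_test.dtype == X_train.dtype
```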

autoPyTorch/data/tabular_validator.py

Lines changed: 73 additions & 7 deletions
```diff
@@ -1,10 +1,21 @@
 # -*- encoding: utf-8 -*-
 import logging
-from typing import Any, Mapping, Optional, Union
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from scipy.sparse import issparse
 
 from autoPyTorch.data.base_validator import BaseInputValidator
-from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
-from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
+from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator
+from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
+from autoPyTorch.data.utils import (
+    DatasetCompressionInputType,
+    DatasetCompressionSpec,
+    DatasetDTypeContainerType,
+    reduce_dataset_size_if_too_large
+)
+from autoPyTorch.utils.common import ispandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
 
 
@@ -27,16 +38,22 @@ class TabularInputValidator(BaseInputValidator):
         target_validator (TargetValidator):
             A TargetValidator instance used to validate and encode (in case of classification)
             the target values
+        dataset_compression (Optional[DatasetCompressionSpec]):
+            specifications for dataset compression. For more info check
+            documentation for `BaseTask.get_dataset`.
     """
     def __init__(
         self,
         is_classification: bool = False,
         logger_port: Optional[int] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
-    ) -> None:
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
+        seed: int = 42,
+    ):
+        self.dataset_compression = dataset_compression
+        self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
         self.is_classification = is_classification
         self.logger_port = logger_port
-        self.dataset_compression = dataset_compression
+        self.seed = seed
         if self.logger_port is not None:
             self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
                 name='Validation',
@@ -46,10 +63,59 @@ def __init__(
             self.logger = logging.getLogger('Validation')
 
         self.feature_validator = TabularFeatureValidator(
-            dataset_compression=self.dataset_compression,
             logger=self.logger)
         self.target_validator = TabularTargetValidator(
             is_classification=self.is_classification,
             logger=self.logger
         )
         self._is_fitted = False
+
+    def _compress_dataset(
+        self,
+        X: DatasetCompressionInputType,
+        y: SupportedTargetTypes,
+    ) -> DatasetCompressionInputType:
+        """
+        Compress the dataset. This function ensures that
+        the testing data is converted to the same dtype as
+        the training data.
+        See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
+        for more information.
+
+        Args:
+            X (DatasetCompressionInputType):
+                features of dataset
+            y (SupportedTargetTypes):
+                targets of dataset
+        Returns:
+            DatasetCompressionInputType:
+                Compressed dataset.
+        """
+        is_dataframe = ispandas(X)
+        is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
+        if not is_reducible_type or self.dataset_compression is None:
+            return X, y
+        elif self._reduced_dtype is not None:
+            X = X.astype(self._reduced_dtype)
+            return X, y
+        else:
+            X, y = reduce_dataset_size_if_too_large(
+                X,
+                y=y,
+                is_classification=self.is_classification,
+                random_state=self.seed,
+                **self.dataset_compression  # type: ignore [arg-type]
+            )
+            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
+            return X, y
+
+    def transform(
+        self,
+        X: SupportedFeatTypes,
+        y: Optional[SupportedTargetTypes] = None,
+    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+
+        X, y = super().transform(X, y)
+        X_reduced, y_reduced = self._compress_dataset(X, y)
+
+        return X_reduced, y_reduced
```
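Putting it together, a sketch of the new validator flow. The constructor arguments follow the diff above, while the `fit(X_train=..., y_train=...)` call is an assumption about the protocol inherited from `BaseInputValidator`:

```python
import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator

X = np.random.rand(1000, 10).astype(np.float64)
y = np.random.randint(0, 3, size=1000)

validator = TabularInputValidator(
    is_classification=True,
    dataset_compression={"memory_allocation": 1, "methods": ["precision", "subsample"]},
    seed=42,
)
validator.fit(X_train=X, y_train=y)
# transform() now also compresses: dtypes may shrink and rows may be subsampled,
# with y reduced in lockstep so features and labels stay aligned.
X_t, y_t = validator.transform(X, y)
```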
