Commit e36ae4a: finalise implementation, add documentation
Parent: 606b231

11 files changed: +311 additions, -245 deletions

autoPyTorch/api/base_task.py
Lines changed: 39 additions & 3 deletions

@@ -12,7 +12,7 @@
 import unittest.mock
 import warnings
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
 
@@ -39,6 +39,7 @@
     STRING_TO_TASK_TYPES,
 )
 from autoPyTorch.data.base_validator import BaseInputValidator
+from autoPyTorch.data.utils import DatasetCompressionSpec
 from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
@@ -299,7 +300,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[BaseDataset, BaseInputValidator]:
         """
         Returns an object of a child class of `BaseDataset` and
@@ -324,6 +325,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
 
         Returns:
             BaseDataset:
@@ -342,7 +346,7 @@ def get_dataset(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> BaseDataset:
         """
         Returns an object of a child class of `BaseDataset` according to the current task.
@@ -365,6 +369,38 @@ def get_dataset(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                We compress datasets so that they fit into some predefined amount of memory.
+                **NOTE**
+
+                You can also pass your own configuration with the same keys and choosing
+                from the available ``"methods"``.
+                The available options are described here:
+                **memory_allocation**
+                    Absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
+                    The memory used by the dataset is checked after each reduction method is
+                    performed. If the dataset fits into the allocated memory, any further methods
+                    listed in ``"methods"`` will not be performed.
+                    It can be either float or int.
+
+                **methods**
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             BaseDataset:
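To make the spec above concrete, here is what a complete DatasetCompressionSpec looks like when written out as a plain Python dict and passed to `get_dataset` (a minimal sketch; the values are illustrative, not defaults from this commit):

# Budget of 10 MB for the dataset; methods run in order until the data fits.
dataset_compression = {
    # Absolute memory in MB; per the docstring above, int or float.
    "memory_allocation": 10,
    # "precision" downcasts floats first; "subsample" drops rows, so the
    # docstring recommends listing it last.
    "methods": ["precision", "subsample"],
}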

autoPyTorch/api/tabular_classification.py
Lines changed: 24 additions & 10 deletions

@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
+
         Returns:
             TabularDataset:
                 the dataset object.
@@ -396,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                    to the lowest possible precision.
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             self
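A usage sketch for the documented `search` parameter (hypothetical and self-contained; the synthetic data, time limits, and budget are illustrative assumptions, not part of the commit):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=5000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

api = TabularClassificationTask(seed=42)
api.search(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    optimize_metric="accuracy",
    total_walltime_limit=300,
    memory_limit=4096,
    # Custom compression spec using the keys documented above.
    dataset_compression={
        "memory_allocation": 10,
        "methods": ["precision", "subsample"],
    },
)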

autoPyTorch/api/tabular_regression.py
Lines changed: 23 additions & 10 deletions

@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
         Returns:
             TabularDataset:
                 the dataset object.
@@ -397,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                    to the lowest possible precision.
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             self
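The regression API mirrors the classification sketch above; only the task class and metric change. Note that the docstring states the stratification guarantee for classification labels, so it does not apply to continuous targets. A minimal sketch, with illustrative values:

from sklearn.datasets import make_regression

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X, y = make_regression(n_samples=5000, n_features=30, random_state=42)

api = TabularRegressionTask(seed=42)
api.search(
    X_train=X, y_train=y,               # continuous targets
    optimize_metric="r2",
    total_walltime_limit=300,
    memory_limit=4096,
    dataset_compression={"memory_allocation": 10, "methods": ["precision"]},
)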

autoPyTorch/data/tabular_feature_validator.py
Lines changed: 1 addition & 8 deletions

@@ -1,6 +1,5 @@
 import functools
-from logging import Logger
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Union, cast
 
 import numpy as np
 
@@ -18,12 +17,6 @@
 from sklearn.pipeline import make_pipeline
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
-from autoPyTorch.data.utils import (
-    DatasetCompressionInputType,
-    DatasetDTypeContainerType,
-    reduce_dataset_size_if_too_large
-)
-from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
 def _create_column_transformer(

autoPyTorch/data/tabular_validator.py
Lines changed: 9 additions & 6 deletions

@@ -1,6 +1,6 @@
 # -*- encoding: utf-8 -*-
 import logging
-from typing import Any, Mapping, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import numpy as np
 
@@ -11,6 +11,7 @@
 from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
 from autoPyTorch.data.utils import (
     DatasetCompressionInputType,
+    DatasetCompressionSpec,
     DatasetDTypeContainerType,
     reduce_dataset_size_if_too_large
 )
@@ -36,20 +37,22 @@ class TabularInputValidator(BaseInputValidator):
         target_validator (TargetValidator):
             A TargetValidator instance used to validate and encode (in case of classification)
             the target values
+        dataset_compression (Optional[DatasetCompressionSpec]):
+            specifications for dataset compression. For more info check
+            documentation for `BaseTask.get_dataset`.
     """
     def __init__(
         self,
         is_classification: bool = False,
         logger_port: Optional[int] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
         seed: int = 42,
     ) -> None:
         self._dataset_compression = dataset_compression
         self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
         self.is_classification = is_classification
         self.logger_port = logger_port
         self.seed = seed
-        self.dataset_compression = dataset_compression
         if self.logger_port is not None:
             self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
                 name='Validation',
@@ -66,7 +69,6 @@ def __init__(
         )
         self._is_fitted = False
 
-    # TODO: modify once we have added subsampling as well.
     def _compress_dataset(
         self,
         X: DatasetCompressionInputType,
@@ -76,7 +78,8 @@ def _compress_dataset(
         Compress the dataset. This function ensures that
         the testing data is converted to the same dtype as
         the training data.
-
+        See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
+        for more information.
 
         Args:
             X (DatasetCompressionInputType):
@@ -100,7 +103,7 @@ def _compress_dataset(
                 y=y,
                 is_classification=self.is_classification,
                 random_state=self.seed,
-                **self._dataset_compression
+                **self._dataset_compression  # type: ignore [arg-type]
            )
            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
            return X, y
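A direct-use sketch of the validator with a compression spec. This is hypothetical: in practice the API classes construct `TabularInputValidator` internally, and the behaviour beyond dtype reduction is assumed to follow `reduce_dataset_size_if_too_large` as the docstring above says.

import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator

validator = TabularInputValidator(
    is_classification=True,
    # Tiny 1 MB budget so that the "precision" reduction actually triggers.
    dataset_compression={"memory_allocation": 1, "methods": ["precision"]},
    seed=42,
)

X_train = np.random.rand(50_000, 20)             # float64 features
y_train = np.random.randint(0, 2, size=50_000)

# Fitting validates the data; with a spec set, the training data is expected
# to be compressed, and _compress_dataset records the reduced dtypes so that
# test data seen later is converted to the same dtype (e.g. float64 -> float32).
validator.fit(X_train=X_train, y_train=y_train)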

autoPyTorch/data/utils.py
Lines changed: 3 additions & 4 deletions

@@ -1,6 +1,5 @@
 # Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py
 import warnings
-from math import floor
 from typing import (
     Any,
     Dict,
@@ -463,7 +462,7 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
 
 def reduce_dataset_size_if_too_large(
     X: DatasetCompressionInputType,
-    memory_allocation: float,
+    memory_allocation: Union[int, float],
     is_classification: bool,
     random_state: Union[int, np.random.RandomState],
     y: Optional[SupportedTargetTypes] = None,
@@ -488,7 +487,7 @@ def reduce_dataset_size_if_too_large(
         X: DatasetCompressionInputType
             The features of the dataset.
 
-        methods: List[str] = ['precision', 'subsample']
+        methods (List[str] = ['precision', 'subsample']):
             A list of operations that are permitted to be performed to reduce
             the size of the dataset.
 
@@ -501,7 +500,7 @@ def reduce_dataset_size_if_too_large(
            memory. Ensures stratification and that unique labels are present
 
 
-        memory_allocation: int
+        memory_allocation (Union[int, float]):
            The amount of memory to allocate to the dataset. It should specify an
            absolute amount.
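The changed signature can be exercised directly. A sketch under stated assumptions: the array sizes and the 10 MB budget are illustrative, and the tuple return value follows the `X, y` usage visible in `_compress_dataset` above; `megabytes` is the helper named in the hunk header.

import numpy as np

from autoPyTorch.data.utils import megabytes, reduce_dataset_size_if_too_large

X = np.random.rand(200_000, 30)            # float64, roughly 46 MB
y = np.random.randint(0, 10, size=200_000)

X_small, y_small = reduce_dataset_size_if_too_large(
    X,
    y=y,
    memory_allocation=10,                  # MB; int or float after this change
    is_classification=True,                # enables stratified subsampling
    random_state=1,
    methods=["precision", "subsample"],    # applied in the given order
)
print(megabytes(X_small))                  # expected to be at most ~10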
