Commit e36ae4a: finalise implementation, add documentation
Parent: 606b231

11 files changed: +311 additions, -245 deletions

autoPyTorch/api/base_task.py
Lines changed: 39 additions & 3 deletions

@@ -12,7 +12,7 @@
 import unittest.mock
 import warnings
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
 
@@ -39,6 +39,7 @@
     STRING_TO_TASK_TYPES,
 )
 from autoPyTorch.data.base_validator import BaseInputValidator
+from autoPyTorch.data.utils import DatasetCompressionSpec
 from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
@@ -299,7 +300,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[BaseDataset, BaseInputValidator]:
         """
         Returns an object of a child class of `BaseDataset` and
@@ -324,6 +325,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
 
         Returns:
             BaseDataset:
@@ -342,7 +346,7 @@ def get_dataset(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> BaseDataset:
         """
         Returns an object of a child class of `BaseDataset` according to the current task.
@@ -365,6 +369,38 @@ def get_dataset(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                We compress datasets so that they fit into some predefined amount of memory.
+                **NOTE**
+
+                You can also pass your own configuration with the same keys and choosing
+                from the available ``"methods"``.
+                The available options are described here:
+                **memory_allocation**
+                    Absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.
+                    The memory used by the dataset is checked after each reduction method is
+                    performed. If the dataset fits into the allocated memory, any further methods
+                    listed in ``"methods"`` will not be performed.
+                    It can be either float or int.
+
+                **methods**
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             BaseDataset:
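To make the spec above concrete, here is what a complete DatasetCompressionSpec looks like when written out as a plain Python dict and passed to `get_dataset` (a minimal sketch; the values are illustrative, not defaults from this commit):

# Budget of 10 MB for the dataset; methods run in order until the data fits.
dataset_compression = {
    # Absolute memory in MB; per the docstring above, int or float.
    "memory_allocation": 10,
    # "precision" downcasts floats first; "subsample" drops rows, so the
    # docstring recommends listing it last.
    "methods": ["precision", "subsample"],
}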

autoPyTorch/api/tabular_classification.py
Lines changed: 24 additions & 10 deletions

@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -166,7 +167,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -190,6 +191,10 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
+
         Returns:
             TabularDataset:
                 the dataset object.
@@ -396,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                    to the lowest possible precision.
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             self
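A usage sketch for the documented `search` parameter (hypothetical and self-contained; the synthetic data, time limits, and budget are illustrative assumptions, not part of the commit):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=5000, n_features=30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

api = TabularClassificationTask(seed=42)
api.search(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    optimize_metric="accuracy",
    total_walltime_limit=300,
    memory_limit=4096,
    # Custom compression spec using the keys documented above.
    dataset_compression={
        "memory_allocation": 10,
        "methods": ["precision", "subsample"],
    },
)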

autoPyTorch/api/tabular_regression.py
Lines changed: 23 additions & 10 deletions

@@ -12,7 +12,8 @@
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.data.utils import (
-    get_dataset_compression_mapping
+    DatasetCompressionSpec,
+    get_dataset_compression_mapping,
 )
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
@@ -167,7 +168,7 @@ def _get_dataset_input_validator(
         resampling_strategy: Optional[ResamplingStrategies] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
         """
         Returns an object of `TabularDataset` and an object of
@@ -191,6 +192,9 @@ def _get_dataset_input_validator(
                 in ```datasets/resampling_strategy.py```.
             dataset_name (Optional[str]):
                 name of the dataset, used as experiment name.
+            dataset_compression (Optional[DatasetCompressionSpec]):
+                specifications for dataset compression. For more info check
+                documentation for `BaseTask.get_dataset`.
         Returns:
             TabularDataset:
                 the dataset object.
@@ -397,14 +401,23 @@ def search(
                 listed in ``"methods"`` will not be performed.
 
                 **methods**
-                We currently provide the following methods for reducing the dataset size.
-                These can be provided in a list and are performed in the order as given.
-                * ``"precision"`` - We reduce floating point precision as follows:
-                    * ``np.float128 -> np.float64``
-                    * ``np.float96 -> np.float64``
-                    * ``np.float64 -> np.float32``
-                * pandas dataframes are reduced using the downcast option of `pd.to_numeric`
-                    to the lowest possible precision.
+                    We currently provide the following methods for reducing the dataset size.
+                    These can be provided in a list and are performed in the order as given.
+                    * ``"precision"`` -
+                        We reduce floating point precision as follows:
+                            * ``np.float128 -> np.float64``
+                            * ``np.float96 -> np.float64``
+                            * ``np.float64 -> np.float32``
+                            * pandas dataframes are reduced using the downcast option of ``pd.to_numeric``
+                              to the lowest possible precision.
+                    * ``subsample`` -
+                        We subsample data such that it **fits directly into
+                        the memory allocation** ``memory_allocation * memory_limit``.
+                        Therefore, this should likely be the last method listed in
+                        ``"methods"``.
+                        Subsampling takes into account classification labels and stratifies
+                        accordingly. We guarantee that at least one occurrence of each
+                        label is included in the sampled set.
 
         Returns:
             self
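The regression API mirrors the classification sketch above; only the task class and metric change. Note that the docstring states the stratification guarantee for classification labels, so it does not apply to continuous targets. A minimal sketch, with illustrative values:

from sklearn.datasets import make_regression

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X, y = make_regression(n_samples=5000, n_features=30, random_state=42)

api = TabularRegressionTask(seed=42)
api.search(
    X_train=X, y_train=y,               # continuous targets
    optimize_metric="r2",
    total_walltime_limit=300,
    memory_limit=4096,
    dataset_compression={"memory_allocation": 10, "methods": ["precision"]},
)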

autoPyTorch/data/tabular_feature_validator.py
Lines changed: 1 addition & 8 deletions

@@ -1,6 +1,5 @@
 import functools
-from logging import Logger
-from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Union, cast
 
 import numpy as np
 
@@ -18,12 +17,6 @@
 from sklearn.pipeline import make_pipeline
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
-from autoPyTorch.data.utils import (
-    DatasetCompressionInputType,
-    DatasetDTypeContainerType,
-    reduce_dataset_size_if_too_large
-)
-from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
 def _create_column_transformer(

autoPyTorch/data/tabular_validator.py
Lines changed: 9 additions & 6 deletions

@@ -1,6 +1,6 @@
 # -*- encoding: utf-8 -*-
 import logging
-from typing import Any, Mapping, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import numpy as np
 
@@ -11,6 +11,7 @@
 from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
 from autoPyTorch.data.utils import (
     DatasetCompressionInputType,
+    DatasetCompressionSpec,
     DatasetDTypeContainerType,
     reduce_dataset_size_if_too_large
 )
@@ -36,20 +37,22 @@ class TabularInputValidator(BaseInputValidator):
         target_validator (TargetValidator):
             A TargetValidator instance used to validate and encode (in case of classification)
             the target values
+        dataset_compression (Optional[DatasetCompressionSpec]):
+            specifications for dataset compression. For more info check
+            documentation for `BaseTask.get_dataset`.
     """
     def __init__(
         self,
         is_classification: bool = False,
         logger_port: Optional[int] = None,
-        dataset_compression: Optional[Mapping[str, Any]] = None,
+        dataset_compression: Optional[DatasetCompressionSpec] = None,
         seed: int = 42,
     ) -> None:
         self._dataset_compression = dataset_compression
         self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
         self.is_classification = is_classification
         self.logger_port = logger_port
         self.seed = seed
-        self.dataset_compression = dataset_compression
         if self.logger_port is not None:
             self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
                 name='Validation',
@@ -66,7 +69,6 @@ def __init__(
         )
         self._is_fitted = False
 
-    # TODO: modify once we have added subsampling as well.
     def _compress_dataset(
         self,
         X: DatasetCompressionInputType,
@@ -76,7 +78,8 @@ def _compress_dataset(
         Compress the dataset. This function ensures that
         the testing data is converted to the same dtype as
         the training data.
-
+        See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
+        for more information.
 
         Args:
             X (DatasetCompressionInputType):
@@ -100,7 +103,7 @@ def _compress_dataset(
                 y=y,
                 is_classification=self.is_classification,
                 random_state=self.seed,
-                **self._dataset_compression
+                **self._dataset_compression  # type: ignore [arg-type]
            )
            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
            return X, y
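A direct-use sketch of the validator with a compression spec. This is hypothetical: in practice the API classes construct `TabularInputValidator` internally, and the behaviour beyond dtype reduction is assumed to follow `reduce_dataset_size_if_too_large` as the docstring above says.

import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator

validator = TabularInputValidator(
    is_classification=True,
    # Tiny 1 MB budget so that the "precision" reduction actually triggers.
    dataset_compression={"memory_allocation": 1, "methods": ["precision"]},
    seed=42,
)

X_train = np.random.rand(50_000, 20)             # float64 features
y_train = np.random.randint(0, 2, size=50_000)

# Fitting validates the data; with a spec set, the training data is expected
# to be compressed, and _compress_dataset records the reduced dtypes so that
# test data seen later is converted to the same dtype (e.g. float64 -> float32).
validator.fit(X_train=X_train, y_train=y_train)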

autoPyTorch/data/utils.py
Lines changed: 3 additions & 4 deletions

@@ -1,6 +1,5 @@
 # Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py
 import warnings
-from math import floor
 from typing import (
     Any,
     Dict,
@@ -463,7 +462,7 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
 
 def reduce_dataset_size_if_too_large(
     X: DatasetCompressionInputType,
-    memory_allocation: float,
+    memory_allocation: Union[int, float],
     is_classification: bool,
     random_state: Union[int, np.random.RandomState],
     y: Optional[SupportedTargetTypes] = None,
@@ -488,7 +487,7 @@ def reduce_dataset_size_if_too_large(
         X: DatasetCompressionInputType
             The features of the dataset.
 
-        methods: List[str] = ['precision', 'subsample']
+        methods (List[str] = ['precision', 'subsample']):
             A list of operations that are permitted to be performed to reduce
             the size of the dataset.
 
@@ -501,7 +500,7 @@ def reduce_dataset_size_if_too_large(
            memory. Ensures stratification and that unique labels are present
 
 
-        memory_allocation: int
+        memory_allocation (Union[int, float]):
            The amount of memory to allocate to the dataset. It should specify an
            absolute amount.
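The changed signature can be exercised directly. A sketch under stated assumptions: the array sizes and the 10 MB budget are illustrative, and the tuple return value follows the `X, y` usage visible in `_compress_dataset` above; `megabytes` is the helper named in the hunk header.

import numpy as np

from autoPyTorch.data.utils import megabytes, reduce_dataset_size_if_too_large

X = np.random.rand(200_000, 30)            # float64, roughly 46 MB
y = np.random.randint(0, 10, size=200_000)

X_small, y_small = reduce_dataset_size_if_too_large(
    X,
    y=y,
    memory_allocation=10,                  # MB; int or float after this change
    is_classification=True,                # enables stratified subsampling
    random_state=1,
    methods=["precision", "subsample"],    # applied in the given order
)
print(megabytes(X_small))                  # expected to be at most ~10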
