@@ -17,14 +17,11 @@
 from typing import Any, Callable, List, Literal, Tuple, Union
 
 import pandas as pd
-from dask import dataframe as dd
 from tqdm import tqdm
 
-from cascade.base import PipeMeta
-
-from ..base import PipeMeta
-from ..data import Dataset, Iterator, Modifier, SequentialCacher
-from ..meta import AggregateValidator, DataValidationException
+from ...base import PipeMeta
+from ...data import Dataset, Iterator, Modifier
+from ...meta import AggregateValidator, DataValidationException
 
 
 class TableDataset(Dataset):
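The relative imports deepen from `..` to `...`, which suggests the module moved one package level down. A hypothetical sketch of why the extra dot is needed (inferred from the diff, not confirmed by it):

```python
# Hypothetical: if tables.py sits at cascade/utils/tables/tables.py, then inside it:
#   from ..base  import PipeMeta   # would resolve to cascade.utils.base (wrong)
#   from ...base import PipeMeta   # resolves to cascade.base            (right)
```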
@@ -132,33 +129,6 @@ def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
         super().__init__(t=t, **kwargs)
 
 
-class PartedTableLoader(Dataset):
-    """
-    Works like CSVDataset, but uses dask to load tables
-    and returns partitions on `__getitem__`.
-
-    See also
-    --------
-    cascade.utils.CSVDataset
-    """
-
-    def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-        self._table = dd.read_csv(csv_file_path, *args, **kwargs)
-
-    def __getitem__(self, index: int):
-        """
-        Returns partition under the index.
-        """
-        return self._table.get_partition(index).compute()
-
-    def __len__(self) -> int:
-        """
-        Returns the number of partitions.
-        """
-        return self._table.npartitions
-
-
 class TableIterator(Iterator):
     """
     Iterates over the table from path by the chunks.
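Anyone depending on the removed class can get the same partition-wise access from dask directly. A minimal sketch, assuming dask is installed and a local `table.csv` exists (neither is implied by the codebase after this change):

```python
from dask import dataframe as dd

ddf = dd.read_csv("table.csv")

n_parts = ddf.npartitions               # what PartedTableLoader.__len__ returned
first = ddf.get_partition(0).compute()  # what __getitem__(0) returned: a pandas DataFrame
print(n_parts, len(first))
```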
@@ -182,26 +152,6 @@ def __next__(self):
         return self._data.get_chunk(self.chunk_size)
 
 
-class LargeCSVDataset(SequentialCacher):
-    """
-    SequentialCacher over large .csv file.
-    Loads table by partitions.
-    """
-
-    def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
-        dataset = PartedTableLoader(csv_file_path, *args, **kwargs)
-        self._ln = len(dataset._table)
-        self.num_batches = dataset._table.npartitions
-        self.bs = self._ln // self.num_batches
-        super().__init__(dataset, self.bs)
-
-    def _load(self, index: int) -> None:
-        self._batch = TableDataset(t=self._dataset[index])
-
-    def __len__(self) -> int:
-        return self._ln
-
-
 class NullValidator(TableDataset, AggregateValidator):
     """
     Checks that there are no null values in the table.
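The surviving `TableIterator` wraps pandas' own chunked reader, as the `get_chunk` context line above shows. A minimal sketch of that mechanism in plain pandas, assuming a local `table.csv`:

```python
import pandas as pd

# iterator=True returns a TextFileReader rather than a DataFrame
reader = pd.read_csv("table.csv", iterator=True)

chunk = reader.get_chunk(100)  # next 100 rows as a DataFrame; raises StopIteration at EOF
print(chunk.shape)
```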
@@ -240,7 +190,7 @@ def __init__(
         ```python
         >>> import pandas as pd
         >>> from cascade.utils.tables import FeatureTable
-        >>> df = pd.read_csv(r'C:\cascade_integration\data\t.csv', index_col=0)
+        >>> df = pd.read_csv(r'data\t.csv', index_col=0)
         >>> df
            id  count name
         0   0      1  aaa
@@ -370,3 +320,13 @@ def get_meta(self) -> PipeMeta:
             for key in self._computed_features_kwargs
         }
         return meta
+
+
+class PartedTableLoader(TableDataset):
+    def __init__(self, *args: Any, t=None, **kwargs: Any) -> None:
+        raise ImportError("PartedTableLoader was removed in 0.12.0, consider using an older version")
+
+
+class LargeCSVDataset(TableDataset):
+    def __init__(self, *args: Any, t=None, **kwargs: Any) -> None:
+        raise ImportError("LargeCSVDataset was removed in 0.12.0, consider using an older version")
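With the stubs above, both names remain importable, so legacy code now fails at construction time with a targeted message rather than, presumably, at import time for lack of the dask dependency the old classes required. A sketch of the resulting behavior, assuming a cascade version that includes this change:

```python
from cascade.utils.tables import LargeCSVDataset  # the import itself still succeeds

try:
    LargeCSVDataset("table.csv")
except ImportError as err:
    print(err)  # LargeCSVDataset was removed in 0.12.0, consider using an older version
```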