@@ -17,14 +17,11 @@
 from typing import Any, Callable, List, Literal, Tuple, Union
 
 import pandas as pd
-from dask import dataframe as dd
 from tqdm import tqdm
 
-from cascade.base import PipeMeta
-
-from ..base import PipeMeta
-from ..data import Dataset, Iterator, Modifier, SequentialCacher
-from ..meta import AggregateValidator, DataValidationException
+from ...base import PipeMeta
+from ...data import Dataset, Iterator, Modifier
+from ...meta import AggregateValidator, DataValidationException
 
 
 class TableDataset(Dataset):
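The relative imports deepen from `..` to `...`, which suggests the module moved one package level down. A hypothetical sketch of why the extra dot is needed (inferred from the diff, not confirmed by it):

```python
# Hypothetical: if tables.py sits at cascade/utils/tables/tables.py, then inside it:
#   from ..base  import PipeMeta   # would resolve to cascade.utils.base (wrong)
#   from ...base import PipeMeta   # resolves to cascade.base            (right)
```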
@@ -132,33 +129,6 @@ def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
         super().__init__(t=t, **kwargs)
 
 
-class PartedTableLoader(Dataset):
-    """
-    Works like CSVDataset, but uses dask to load tables
-    and returns partitions on `__getitem__`.
-
-    See also
-    --------
-    cascade.utils.CSVDataset
-    """
-
-    def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-        self._table = dd.read_csv(csv_file_path, *args, **kwargs)
-
-    def __getitem__(self, index: int):
-        """
-        Returns partition under the index.
-        """
-        return self._table.get_partition(index).compute()
-
-    def __len__(self) -> int:
-        """
-        Returns the number of partitions.
-        """
-        return self._table.npartitions
-
-
 class TableIterator(Iterator):
     """
     Iterates over the table from path by the chunks.
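Anyone depending on the removed class can get the same partition-wise access from dask directly. A minimal sketch, assuming dask is installed and a local `table.csv` exists (neither is implied by the codebase after this change):

```python
from dask import dataframe as dd

ddf = dd.read_csv("table.csv")

n_parts = ddf.npartitions               # what PartedTableLoader.__len__ returned
first = ddf.get_partition(0).compute()  # what __getitem__(0) returned: a pandas DataFrame
print(n_parts, len(first))
```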
@@ -182,26 +152,6 @@ def __next__(self):
         return self._data.get_chunk(self.chunk_size)
 
 
-class LargeCSVDataset(SequentialCacher):
-    """
-    SequentialCacher over large .csv file.
-    Loads table by partitions.
-    """
-
-    def __init__(self, csv_file_path: str, *args: Any, **kwargs: Any) -> None:
-        dataset = PartedTableLoader(csv_file_path, *args, **kwargs)
-        self._ln = len(dataset._table)
-        self.num_batches = dataset._table.npartitions
-        self.bs = self._ln // self.num_batches
-        super().__init__(dataset, self.bs)
-
-    def _load(self, index: int) -> None:
-        self._batch = TableDataset(t=self._dataset[index])
-
-    def __len__(self) -> int:
-        return self._ln
-
-
 class NullValidator(TableDataset, AggregateValidator):
     """
     Checks that there are no null values in the table.
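The surviving `TableIterator` wraps pandas' own chunked reader, as the `get_chunk` context line above shows. A minimal sketch of that mechanism in plain pandas, assuming a local `table.csv`:

```python
import pandas as pd

# iterator=True returns a TextFileReader rather than a DataFrame
reader = pd.read_csv("table.csv", iterator=True)

chunk = reader.get_chunk(100)  # next 100 rows as a DataFrame; raises StopIteration at EOF
print(chunk.shape)
```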
@@ -240,7 +190,7 @@ def __init__(
         ```python
         >>> import pandas as pd
         >>> from cascade.utils.tables import FeatureTable
-        >>> df = pd.read_csv(r'C:\cascade_integration\data\t.csv', index_col=0)
+        >>> df = pd.read_csv(r'data\t.csv', index_col=0)
         >>> df
            id  count name
         0   0      1  aaa
@@ -370,3 +320,13 @@ def get_meta(self) -> PipeMeta:
             for key in self._computed_features_kwargs
         }
         return meta
+
+
+class PartedTableLoader(TableDataset):
+    def __init__(self, *args: Any, t=None, **kwargs: Any) -> None:
+        raise ImportError("PartedTableLoader was removed in 0.12.0, consider using an older version")
+
+
+class LargeCSVDataset(TableDataset):
+    def __init__(self, *args: Any, t=None, **kwargs: Any) -> None:
+        raise ImportError("LargeCSVDataset was removed in 0.12.0, consider using an older version")
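With the stubs above, both names remain importable, so legacy code now fails at construction time with a targeted message rather than, presumably, at import time for lack of the dask dependency the old classes required. A sketch of the resulting behavior, assuming a cascade version that includes this change:

```python
from cascade.utils.tables import LargeCSVDataset  # the import itself still succeeds

try:
    LargeCSVDataset("table.csv")
except ImportError as err:
    print(err)  # LargeCSVDataset was removed in 0.12.0, consider using an older version
```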