Skip to content

Modin fails to load csv from s3 with ray client #2688

@Bhavya6187

Description

@Bhavya6187

System information

  • OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Ubuntu
  • Modin version (modin.__version__): master (0.8.3+22.ge99b629)
  • Python version: 3.7
  • Code we can use to reproduce:
import ray
import os
import ray.util
ray.util.connect("<service_ip>:50051")
import modin.pandas as pd
pd.DEFAULT_NPARTITIONS = 10
df = pd.read_csv("s3://<bucket>/HIGGS_100k.csv")

Describe the problem

Modin fails to load a CSV file from S3 when connected through the Ray client: `pd.read_csv` raises an `AssertionError` (full traceback below).

Source code / logs

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-10-9b3c648a226d> in <module>()
----> 1 df = pd.read_csv("s3://<s3_bucket>/HIGGS_100k.csv")

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/pandas/io.py in parser_func(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
    114 
    115         kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
--> 116         return _read(**kwargs)
    117 
    118     return parser_func

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/pandas/io.py in _read(**kwargs)
    133 
    134     Engine.subscribe(_update_engine)
--> 135     pd_obj = EngineDispatcher.read_csv(**kwargs)
    136     # This happens when `read_csv` returns a TextFileReader object for iterating through
    137     if isinstance(pd_obj, pandas.io.parsers.TextFileReader):

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/data_management/factories/dispatcher.py in read_csv(cls, **kwargs)
    102     @classmethod
    103     def read_csv(cls, **kwargs):
--> 104         return cls.__engine._read_csv(**kwargs)
    105 
    106     @classmethod

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/data_management/factories/factories.py in _read_csv(cls, **kwargs)
     85     @classmethod
     86     def _read_csv(cls, **kwargs):
---> 87         return cls.io_cls.read_csv(**kwargs)
     88 
     89     @classmethod

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/base/io/file_dispatcher.py in read(cls, *args, **kwargs)
     27     @classmethod
     28     def read(cls, *args, **kwargs):
---> 29         query_compiler = cls._read(*args, **kwargs)
     30         # TODO (devin-petersohn): Make this section more general for non-pandas kernel
     31         # implementations.

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/base/io/text/csv_dispatcher.py in _read(cls, filepath_or_buffer, **kwargs)
    192         dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
    193 
--> 194         partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
    195         # If parse_dates is present, the column names that we have might not be
    196         # the same length as the returned column names. If we do need to modify

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/base/io/text/text_file_dispatcher.py in build_partition(cls, partition_ids, row_lengths, column_widths)
     51                     for j in range(len(partition_ids[i]))
     52                 ]
---> 53                 for i in range(len(partition_ids))
     54             ]
     55         )

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/base/io/text/text_file_dispatcher.py in <listcomp>(.0)
     51                     for j in range(len(partition_ids[i]))
     52                 ]
---> 53                 for i in range(len(partition_ids))
     54             ]
     55         )

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/base/io/text/text_file_dispatcher.py in <listcomp>(.0)
     49                         width=column_widths[j],
     50                     )
---> 51                     for j in range(len(partition_ids[i]))
     52                 ]
     53                 for i in range(len(partition_ids))

/home/bhavya.agarwal/.local/lib/python3.7/site-packages/modin/engines/ray/pandas_on_ray/frame/partition.py in __init__(self, object_id, length, width, ip, call_queue)
     25 class PandasOnRayFramePartition(BaseFramePartition):
     26     def __init__(self, object_id, length=None, width=None, ip=None, call_queue=None):
---> 27         assert type(object_id) is ray.ObjectID
     28 
     29         self.oid = object_id

AssertionError:

Metadata

Metadata

Assignees

Labels

bug 🦗 Something isn't working

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions