28 | 28 | def read_parquet(path, engine='auto', columns=None, **kwargs): |
29 | 29 | """Load a parquet object from the file path, returning a DataFrame. |
30 | 30 | Ray DataFrame only supports pyarrow engine for now. |
31 | | -
|
32 | 31 | Args: |
33 | 32 | path: The filepath of the parquet file. |
34 | 33 | We only support local files for now. |
35 | 34 | engine: Ray only supports the pyarrow reader. |
36 | 35 | This argument doesn't do anything for now. |
37 | 36 | kwargs: Passed into parquet's read_pandas function. |
38 | | -
|
39 | 37 | Notes: |
40 | 38 | ParquetFile API is used. Please refer to the documentation here |
41 | 39 | https://arrow.apache.org/docs/python/parquet.html |
42 | 40 | """ |
| 41 | + return _read_parquet_pandas_on_ray(path, engine, columns, **kwargs) |
| 42 | + |
| 43 | + |
| 44 | +def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs): |
| 45 | + from pyarrow.parquet import ParquetFile |
| 46 | + |
43 | 47 | if not columns: |
44 | 48 | pf = ParquetFile(path) |
45 | 49 | columns = [ |
46 | 50 | name for name in pf.metadata.schema.names |
47 | 51 | if not PQ_INDEX_REGEX.match(name) |
48 | 52 | ] |
49 | | - |
| 53 | + num_splits = min( |
| 54 | + len(columns), RayBlockPartitions._compute_num_partitions()) |
50 | 55 | # Each item in this list will be a column of the original df |
51 | 56 | # partitioned into smaller pieces along the rows. |
52 | 57 | # We need to transpose the oids array to fit our schema. |
53 | | - blk_partitions = [ |
54 | | - ray.get(_read_parquet_column.remote(path, col, kwargs)) |
55 | | - for col in columns |
56 | | - ] |
57 | | - blk_partitions = np.array(blk_partitions).T |
58 | | - |
59 | | - return DataFrame(block_partitions=blk_partitions, columns=columns) |
| 58 | + blk_partitions = np.array([ |
| 59 | + _read_parquet_column._submit( |
| 60 | + args=(path, col, num_splits, kwargs), |
| 61 | + num_return_vals=num_splits + 1) for col in columns |
| 62 | + ]).T |
| 63 | + remote_partitions = np.array([[RayRemotePartition(obj) for obj in row] |
| 64 | + for row in blk_partitions[:-1]]) |
| 65 | + index_len = ray.get(blk_partitions[-1][0]) |
| 66 | + index = pandas.RangeIndex(index_len) |
| 67 | + new_manager = PandasDataManager( |
| 68 | + RayBlockPartitions(remote_partitions), index, columns) |
| 69 | + df = DataFrame(data_manager=new_manager) |
| 70 | + return df |
60 | 71 |
|
61 | 72 |
|
62 | 73 | # CSV |
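The grid built in the hunk above is worth spelling out: each _submit call returns num_splits row blocks for one column plus one extra object holding the index length, so stacking the per-column lists and transposing yields a (num_splits + 1) x len(columns) grid whose last row carries the index lengths. The sketch below reproduces that layout with plain numpy and placeholder strings in place of Ray ObjectIDs; the names and values are illustrative only.

    import numpy as np

    num_splits = 2
    columns = ["a", "b", "c"]
    # One list per column: num_splits row blocks followed by the index length.
    per_column = np.array(
        [["%s_block%d" % (col, i) for i in range(num_splits)] + ["3"]
         for col in columns])
    grid = per_column.T        # shape (num_splits + 1, len(columns))
    row_blocks = grid[:-1]     # the partition grid handed to RayBlockPartitions
    index_lengths = grid[-1]   # every entry carries the same index length
    print(row_blocks)
    print(index_lengths[0])
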
@@ -555,6 +566,13 @@ def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, h |
555 | 566 |
|
556 | 567 | @ray.remote |
557 | | -def _read_parquet_column(path, column, kwargs={}): |
| 568 | +def _read_parquet_column(path, column, num_splits, kwargs): |
| 569 | + import pyarrow.parquet as pq |
| 570 | + df = pq.read_pandas(path, columns=[column], **kwargs).to_pandas() |
| 571 | + # Append the length of the index here to build it externally |
| 572 | + return split_result_of_axis_func_pandas(0, num_splits, |
| 573 | + df) + [len(df.index)] |
| 574 | + # IMPORTANT: DO NOT DELETE THE CODE BELOW |
| 575 | + # Deleting this code for some reason causes workers to crash on high-CPU machines. |
558 | 576 | df = pq.read_pandas(path, columns=[column], **kwargs).to_pandas() |
559 | 577 | oids = _partition_pandas_dataframe(df, num_partitions=get_npartitions()) |
560 | 578 | return oids |
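
For readers without a Ray cluster at hand, the contract of _read_parquet_column can be sketched locally. The helper below is a hypothetical, Ray-free stand-in: it reads a single column with pyarrow, splits it along the rows, and appends the index length, with numpy.array_split standing in as a rough approximation of split_result_of_axis_func_pandas.

    import numpy as np
    import pyarrow.parquet as pq

    def read_parquet_column_sketch(path, column, num_splits, kwargs=None):
        # Ray-free sketch of the per-column reader's return contract:
        # num_splits row chunks, then the index length so the caller can
        # rebuild a RangeIndex without shipping the full index.
        kwargs = kwargs or {}
        df = pq.read_pandas(path, columns=[column], **kwargs).to_pandas()
        return list(np.array_split(df, num_splits)) + [len(df.index)]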
|