FIX-#2239: Compute row index start using pandas (#2240)

devin-petersohn · web-flow · commit a7d30932a9fe · 2020-10-27T20:28:25.000+03:00
* FIX-#2239: Compute row index start using pandas
  Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
* FIX-#2239: Documentation
  Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
* FIX-#2239: Improve testing for case
  Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py
@@ -180,11 +180,6 @@ def _read(cls, filepath_or_buffer, **kwargs):
         if index_col is None:
             row_lengths = cls.materialize(index_ids)
             new_index = pandas.RangeIndex(sum(row_lengths))
-            # pandas has a really weird edge case here.
-            if kwargs.get("names", None) is not None and skiprows > 1:
-                new_index = pandas.RangeIndex(
-                    skiprows - 1, new_index.stop + skiprows - 1
-                )
         else:
             index_objs = cls.materialize(index_ids)
             row_lengths = [len(o) for o in index_objs]
diff --git a/modin/pandas/test/data/issue_2239.csv b/modin/pandas/test/data/issue_2239.csv
@@ -0,0 +1,146 @@
+1585542839.000000, 1585542839.000000, 1585542839.000000
+32.000000, 32.000000, 32.000000
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
@@ -1113,7 +1113,7 @@ def test_from_csv_chunksize(make_csv_file):
     df_equals(modin_df, pd_df)
 
 
-@pytest.mark.parametrize("nrows", [123, None])
+@pytest.mark.parametrize("nrows", [1, 2, 123, None])
 def test_from_csv_skiprows(make_csv_file, nrows):
     make_csv_file()
 
@@ -1129,6 +1129,22 @@ def test_from_csv_skiprows(make_csv_file, nrows):
     )
     df_equals(modin_df, pandas_df)
 
+    pandas_df = pandas.read_csv(
+        TEST_CSV_FILENAME,
+        header=None,
+        names=["c1", "c2", "c3", "c4"],
+        skiprows=2,
+        nrows=nrows,
+    )
+    modin_df = pd.read_csv(
+        TEST_CSV_FILENAME,
+        header=None,
+        names=["c1", "c2", "c3", "c4"],
+        skiprows=2,
+        nrows=nrows,
+    )
+    df_equals(modin_df, pandas_df)
+
     pandas_df = pandas.read_csv(
         TEST_CSV_FILENAME,
         names=["c1", "c2", "c3", "c4"],
@@ -1144,6 +1160,15 @@ def test_from_csv_skiprows(make_csv_file, nrows):
     df_equals(modin_df, pandas_df)
 
 
+@pytest.mark.parametrize("names", [list("XYZ"), None])
+@pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
+def test_from_csv_skiprows_names(names, skiprows):
+    path = "modin/pandas/test/data/issue_2239.csv"
+    pandas_df = pandas.read_csv(path, names=names, skiprows=skiprows)
+    modin_df = pd.read_csv(path, names=names, skiprows=skiprows)
+    df_equals(pandas_df, modin_df)
+
+
 @pytest.mark.parametrize(
     "encoding", ["latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"]
 )