Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#2722: add ASV read_csv skiprows benchmark #2724

Merged
merged 4 commits into from
Feb 15, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 7 additions & 65 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
# define `MODIN_ASV_USE_IMPL` env var to choose library for using in performance
# measurements

import os
import modin.pandas as pd
import numpy as np
import pandas

from .utils import (
generate_dataframe,
Expand All @@ -29,71 +27,15 @@
random_string,
random_columns,
random_booleans,
ASV_USE_IMPL,
ASV_DATASET_SIZE,
BINARY_OP_DATA_SIZE,
UNARY_OP_DATA_SIZE,
GROUPBY_NGROUPS,
IMPL,
execute,
)

try:
from modin.config import NPartitions

NPARTITIONS = NPartitions.get()
except ImportError:
NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
from modin.config import TestDatasetSize, AsvImplementation

ASV_USE_IMPL = AsvImplementation.get()
ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
# The same benchmarking code can be run for different versions of Modin, so in
# case of an error importing important variables, we'll just use predefined values
ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

assert ASV_USE_IMPL in ("modin", "pandas")

BINARY_OP_DATA_SIZE = {
"Big": [
((5000, 5000), (5000, 5000)),
# the case extremely inefficient
# ((20, 500_000), (10, 1_000_000)),
((500_000, 20), (1_000_000, 10)),
],
"Small": [
((250, 250), (250, 250)),
((20, 10_000), (10, 25_000)),
((10_000, 20), (25_000, 10)),
],
}

UNARY_OP_DATA_SIZE = {
"Big": [
(5000, 5000),
# the case extremely inefficient
# (10, 1_000_000),
(1_000_000, 10),
],
"Small": [
(250, 250),
(10, 10_000),
(10_000, 10),
],
}

GROUPBY_NGROUPS = {
"Big": 100,
"Small": 5,
}

IMPL = {
"modin": pd,
"pandas": pandas,
}


def execute(df):
"Make sure the calculations are done."
return df.shape, df.dtypes


class BaseTimeGroupBy:
def setup(self, shape, groupby_ncols=1):
Expand Down
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
65 changes: 65 additions & 0 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import modin.pandas as pd
import numpy as np

from ..utils import (
generate_dataframe,
RAND_LOW,
RAND_HIGH,
ASV_USE_IMPL,
ASV_DATASET_SIZE,
UNARY_OP_DATA_SIZE,
IMPL,
execute,
get_array_id,
)

# ray init
if ASV_USE_IMPL == "modin":
pd.DataFrame([])


class BaseReadCsv:
# test data file can be created only once
def setup_cache(self, test_filename="io_test_file"):
test_filenames = {}
for shape in UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE]:
data_id = get_array_id(shape)
test_filenames[data_id] = f"{test_filename}_{data_id}.csv"
df = generate_dataframe("pandas", "str_int", *shape, RAND_LOW, RAND_HIGH)
df.to_csv(test_filenames[data_id], index=False)

return test_filenames

def setup(self, test_filenames, shape, *args, **kwargs):
self.data_id = get_array_id(shape)
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
amyskov marked this conversation as resolved.
Show resolved Hide resolved


class TimeReadCsvSkiprows(BaseReadCsv):
param_names = ["shape", "skiprows"]
params = [
UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
[
None,
lambda x: x % 2,
np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0] // 10),
np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0], 2),
],
]

def time_skiprows(self, test_filenames, shape, skiprows):
execute(
IMPL[ASV_USE_IMPL].read_csv(test_filenames[self.data_id], skiprows=skiprows)
)
69 changes: 69 additions & 0 deletions asv_bench/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import logging
import modin.pandas as pd
import pandas
Expand All @@ -22,6 +23,65 @@
random_state = np.random.RandomState(seed=42)


try:
from modin.config import NPartitions

NPARTITIONS = NPartitions.get()
except ImportError:
NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
from modin.config import TestDatasetSize, AsvImplementation

ASV_USE_IMPL = AsvImplementation.get()
ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
# The same benchmarking code can be run for different versions of Modin, so in
# case of an error importing important variables, we'll just use predefined values
ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

assert ASV_USE_IMPL in ("modin", "pandas")

BINARY_OP_DATA_SIZE = {
"Big": [
((5000, 5000), (5000, 5000)),
# the case extremely inefficient
# ((20, 500_000), (10, 1_000_000)),
((500_000, 20), (1_000_000, 10)),
],
"Small": [
((250, 250), (250, 250)),
((20, 10_000), (10, 25_000)),
((10_000, 20), (25_000, 10)),
],
}

UNARY_OP_DATA_SIZE = {
"Big": [
(5000, 5000),
# the case extremely inefficient
# (10, 1_000_000),
(1_000_000, 10),
],
"Small": [
(250, 250),
(10, 10_000),
(10_000, 10),
],
}

GROUPBY_NGROUPS = {
"Big": 100,
"Small": 5,
}

IMPL = {
"modin": pd,
"pandas": pandas,
}


class weakdict(dict):
__slots__ = ("__weakref__",)

Expand Down Expand Up @@ -144,3 +204,12 @@ def random_columns(df_columns, columns_number):

def random_booleans(number):
return list(random_state.choice([True, False], size=number))


def execute(df):
"Make sure the calculations are done."
return df.shape, df.dtypes


def get_array_id(array):
return "_".join([str(element) for element in array])