Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#2722: add ASV read_csv skiprows benchmark #2724

Merged
merged 4 commits into from
Feb 15, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 7 additions & 65 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
# define `MODIN_ASV_USE_IMPL` env var to choose library for using in performance
# measurements

import os
import modin.pandas as pd
import numpy as np
import pandas

from .utils import (
generate_dataframe,
Expand All @@ -29,71 +27,15 @@
random_string,
random_columns,
random_booleans,
ASV_USE_IMPL,
ASV_DATASET_SIZE,
BINARY_OP_DATA_SIZE,
UNARY_OP_DATA_SIZE,
GROUPBY_NGROUPS,
IMPL,
execute,
)

try:
from modin.config import NPartitions

NPARTITIONS = NPartitions.get()
except ImportError:
NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
from modin.config import TestDatasetSize, AsvImplementation

ASV_USE_IMPL = AsvImplementation.get()
ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
# The same benchmarking code can be run for different versions of Modin, so in
# case of an error importing important variables, we'll just use predefined values
ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

assert ASV_USE_IMPL in ("modin", "pandas")

BINARY_OP_DATA_SIZE = {
"Big": [
((5000, 5000), (5000, 5000)),
# the case extremely inefficient
# ((20, 500_000), (10, 1_000_000)),
((500_000, 20), (1_000_000, 10)),
],
"Small": [
((250, 250), (250, 250)),
((20, 10_000), (10, 25_000)),
((10_000, 20), (25_000, 10)),
],
}

UNARY_OP_DATA_SIZE = {
"Big": [
(5000, 5000),
# the case extremely inefficient
# (10, 1_000_000),
(1_000_000, 10),
],
"Small": [
(250, 250),
(10, 10_000),
(10_000, 10),
],
}

GROUPBY_NGROUPS = {
"Big": 100,
"Small": 5,
}

IMPL = {
"modin": pd,
"pandas": pandas,
}


def execute(df):
"Make sure the calculations are done."
return df.shape, df.dtypes


class BaseTimeGroupBy:
def setup(self, shape, groupby_ncols=1):
Expand Down
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
65 changes: 65 additions & 0 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import modin.pandas as pd
import numpy as np

from ..utils import (
generate_dataframe,
RAND_LOW,
RAND_HIGH,
ASV_USE_IMPL,
ASV_DATASET_SIZE,
UNARY_OP_DATA_SIZE,
IMPL,
execute,
get_array_id,
)

# ray init
if ASV_USE_IMPL == "modin":
pd.DataFrame([])


class BaseReadCsv:
# test data file can be created only once
def setup_cache(self, test_filename="io_test_file"):
test_filenames = {}
for shape in UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE]:
data_id = get_array_id(shape)
test_filenames[data_id] = f"{test_filename}_{data_id}.csv"
df = generate_dataframe("pandas", "str_int", *shape, RAND_LOW, RAND_HIGH)
df.to_csv(test_filenames[data_id], index=False)

return test_filenames

def setup(self, test_filenames, shape, *args, **kwargs):
self.data_id = get_array_id(shape)
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
amyskov marked this conversation as resolved.
Show resolved Hide resolved


class TimeReadCsvSkiprows(BaseReadCsv):
param_names = ["shape", "skiprows"]
params = [
UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
[
None,
lambda x: x % 2,
np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0] // 10),
np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0], 2),
],
]

def time_skiprows(self, test_filenames, shape, skiprows):
execute(
IMPL[ASV_USE_IMPL].read_csv(test_filenames[self.data_id], skiprows=skiprows)
)
69 changes: 69 additions & 0 deletions asv_bench/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import logging
import modin.pandas as pd
import pandas
Expand All @@ -22,6 +23,65 @@
random_state = np.random.RandomState(seed=42)


try:
from modin.config import NPartitions

NPARTITIONS = NPartitions.get()
except ImportError:
NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
from modin.config import TestDatasetSize, AsvImplementation

ASV_USE_IMPL = AsvImplementation.get()
ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
except ImportError:
# The same benchmarking code can be run for different versions of Modin, so in
# case of an error importing important variables, we'll just use predefined values
ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")

assert ASV_USE_IMPL in ("modin", "pandas")

BINARY_OP_DATA_SIZE = {
"Big": [
((5000, 5000), (5000, 5000)),
# the case extremely inefficient
# ((20, 500_000), (10, 1_000_000)),
((500_000, 20), (1_000_000, 10)),
],
"Small": [
((250, 250), (250, 250)),
((20, 10_000), (10, 25_000)),
((10_000, 20), (25_000, 10)),
],
}

UNARY_OP_DATA_SIZE = {
"Big": [
(5000, 5000),
# the case extremely inefficient
# (10, 1_000_000),
(1_000_000, 10),
],
"Small": [
(250, 250),
(10, 10_000),
(10_000, 10),
],
}

GROUPBY_NGROUPS = {
"Big": 100,
"Small": 5,
}

IMPL = {
"modin": pd,
"pandas": pandas,
}


class weakdict(dict):
__slots__ = ("__weakref__",)

Expand Down Expand Up @@ -144,3 +204,12 @@ def random_columns(df_columns, columns_number):

def random_booleans(number):
return list(random_state.choice([True, False], size=number))


def execute(df):
"Make sure the calculations are done."
return df.shape, df.dtypes


def get_array_id(array):
return "_".join([str(element) for element in array])