TEST-#2753: add GroupBy benchmarks with a huge number of groups #2754

Merged 1 commit on Feb 18, 2021
asv_bench/benchmarks/benchmarks.py: 64 changes (40 additions, 24 deletions)
@@ -38,68 +38,76 @@


 class BaseTimeGroupBy:
-    def setup(self, shape, groupby_ncols=1):
+    def setup(self, shape, ngroups=5, groupby_ncols=1):
+        if callable(ngroups):
+            ngroups = ngroups(shape[0])
         self.df, self.groupby_columns = generate_dataframe(
             ASV_USE_IMPL,
             "int",
             *shape,
             RAND_LOW,
             RAND_HIGH,
             groupby_ncols,
-            count_groups=GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+            count_groups=ngroups,
         )
 
 
 class TimeGroupByMultiColumn(BaseTimeGroupBy):
-    param_names = ["shape", "groupby_ncols"]
+    param_names = ["shape", "ngroups", "groupby_ncols"]
     params = [
         UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+        GROUPBY_NGROUPS[ASV_DATASET_SIZE],
         [6],
     ]
 
-    def time_groupby_agg_quan(self, shape, groupby_ncols):
+    def time_groupby_agg_quan(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).agg("quantile"))
 
-    def time_groupby_agg_mean(self, shape, groupby_ncols):
+    def time_groupby_agg_mean(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean()))
 
 
 class TimeGroupByDefaultAggregations(BaseTimeGroupBy):
-    param_names = ["shape"]
+    param_names = ["shape", "ngroups"]
     params = [
         UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+        GROUPBY_NGROUPS[ASV_DATASET_SIZE],
     ]
 
-    def time_groupby_count(self, shape):
+    def time_groupby_count(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).count())
 
-    def time_groupby_size(self, shape):
+    def time_groupby_size(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).size())
 
-    def time_groupby_sum(self, shape):
+    def time_groupby_sum(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).sum())
 
-    def time_groupby_mean(self, shape):
+    def time_groupby_mean(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).mean())
 
 
 class TimeGroupByDictionaryAggregation(BaseTimeGroupBy):
-    param_names = ["shape", "operation_type"]
-    params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], ["reduction", "aggregation"]]
+    param_names = ["shape", "ngroups", "operation_type"]
+    params = [
+        UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+        GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+        ["reduction", "aggregation"],
+    ]
     operations = {
         "reduction": ["sum", "count", "prod"],
         "aggregation": ["quantile", "std", "median"],
     }
 
-    def setup(self, shape, operation_type):
-        super().setup(shape)
+    def setup(self, shape, ngroups, operation_type):
+        super().setup(shape, ngroups)
         self.cols_to_agg = self.df.columns[1:4]
         operations = self.operations[operation_type]
         self.agg_dict = {
             c: operations[i % len(operations)] for i, c in enumerate(self.cols_to_agg)
         }
 
-    def time_groupby_dict_agg(self, shape, operation_type):
+    def time_groupby_dict_agg(self, *args, **kwargs):
         execute(self.df.groupby(by=self.groupby_columns).agg(self.agg_dict))
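A note on the new `*args, **kwargs` signatures above: ASV calls each `time_*` method with one positional argument per entry in `param_names`, and these bodies only use state prepared in `setup`, so the generic signature stays valid as parameters such as `ngroups` are added. A minimal sketch of that calling convention (this `Bench` class is illustrative, not Modin code):

# Illustrative sketch of how ASV invokes a timing method (not Modin code).
class Bench:
    param_names = ["shape", "ngroups", "groupby_ncols"]

    def time_op(self, *args, **kwargs):
        # Real benchmark bodies ignore these arguments and operate on
        # state that setup() prepared from the same parameter tuple.
        pass

# ASV passes one positional argument per param_names entry:
Bench().time_op((5_000_000, 10), 100, 6)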


@@ -391,7 +399,7 @@ class BaseTimeValueCounts:
         "half": lambda shape: shape[1] // 2,
     }
 
-    def setup(self, shape, subset="all"):
+    def setup(self, shape, ngroups=5, subset="all"):
         try:
             subset = self.subset_params[subset]
         except KeyError:
@@ -406,28 +414,36 @@ def setup(self, shape, subset="all"):
             RAND_LOW,
             RAND_HIGH,
             groupby_ncols=ncols,
-            count_groups=GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+            count_groups=ngroups,
         )
         self.subset = self.df.columns[:ncols].tolist()
 
 
 class TimeValueCountsFrame(BaseTimeValueCounts):
-    param_names = ["shape", "subset"]
-    params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], ["all", "half"]]
+    param_names = ["shape", "ngroups", "subset"]
+    params = [
+        UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+        GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+        ["all", "half"],
+    ]
 
     def time_value_counts(self, *args, **kwargs):
         execute(self.df.value_counts(subset=self.subset))
 
 
 class TimeValueCountsSeries(BaseTimeValueCounts):
-    param_names = ["shape", "bins"]
-    params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 3]]
+    param_names = ["shape", "ngroups", "bins"]
+    params = [
+        UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+        GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+        [None, 3],
+    ]
 
-    def setup(self, shape, bins):
-        super().setup(shape=shape)
+    def setup(self, shape, ngroups, bins):
+        super().setup(ngroups=ngroups, shape=shape)
         self.df = self.df.iloc[:, 0]
 
-    def time_value_counts(self, shape, bins):
+    def time_value_counts(self, shape, ngroups, bins):
         execute(self.df.value_counts(bins=bins))
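The core mechanism of this change: an entry in `GROUPBY_NGROUPS` may now be a plain int or a callable taking the row count, and `BaseTimeGroupBy.setup` resolves the callable before generating data. A standalone sketch of that resolution (`resolve_ngroups` is an illustrative helper, not part of the PR; the lambda is the one added to `GROUPBY_NGROUPS` below):

# Mirrors the `if callable(ngroups)` branch added to BaseTimeGroupBy.setup.
def resolve_ngroups(ngroups, nrows):
    if callable(ngroups):
        ngroups = ngroups(nrows)
    return ngroups

# A fixed value passes through unchanged.
assert resolve_ngroups(100, 5_000_000) == 100

# The huge-groups case: about one group per two rows, capped at 5000.
huge = lambda nrows: min(nrows // 2, 5000)
assert resolve_ngroups(huge, 6_000) == 3_000
assert resolve_ngroups(huge, 5_000_000) == 5_000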
asv_bench/benchmarks/utils.py: 4 changes (2 additions, 2 deletions)
@@ -72,8 +72,8 @@
 }
 
 GROUPBY_NGROUPS = {
-    "Big": 100,
-    "Small": 5,
+    "Big": [100, lambda nrows: min(nrows // 2, 5000)],
+    "Small": [5],
 }
 
 IMPL = {
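Since each `GROUPBY_NGROUPS` entry is now a list, it plugs directly into ASV's `params` matrix: ASV benchmarks every combination of the parameter lists, so each shape is timed once per group count instead of with a single fixed value. A sketch of the resulting grid (the shape value here is illustrative; the real one comes from `UNARY_OP_DATA_SIZE`):

from itertools import product

# Illustrative shape; the real value comes from UNARY_OP_DATA_SIZE.
shapes = [(5_000_000, 10)]
ngroups_variants = [100, lambda nrows: min(nrows // 2, 5000)]  # "Big" entry

# ASV expands params as a Cartesian product of the parameter lists.
for shape, ngroups in product(shapes, ngroups_variants):
    if callable(ngroups):
        ngroups = ngroups(shape[0])  # resolved in setup(), as in the diff
    print(f"shape={shape}, ngroups={ngroups}")
# shape=(5000000, 10), ngroups=100
# shape=(5000000, 10), ngroups=5000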