From 5877fb01cf078e20ddaeb819e9c6da1fdc87bf59 Mon Sep 17 00:00:00 2001
From: sbrugman
Date: Wed, 7 Apr 2021 18:19:46 +0200
Subject: [PATCH] Benchmark introduction

---
 .github/workflows/benchmark.yml    |  37 ++++
 requirements-test.txt              |   1 +
 tests/benchmarks/bench.py          |  74 +++++++++++++++++++
 tests/performance/time_inf.py      |  25 -------
 tests/performance/time_kurtosis.py |  36 ---------
 tests/performance/time_mad.py      |  56 --------------
 tests/performance/time_mean.py     |  36 ---------
 tests/performance/timings.py       | 113 -----------------------------
 8 files changed, 112 insertions(+), 266 deletions(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 tests/benchmarks/bench.py
 delete mode 100644 tests/performance/time_inf.py
 delete mode 100644 tests/performance/time_kurtosis.py
 delete mode 100644 tests/performance/time_mad.py
 delete mode 100644 tests/performance/time_mean.py
 delete mode 100644 tests/performance/timings.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 000000000..24c1bf373
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,37 @@
+name: Performance Benchmarks
+
+on:
+  push:
+    branches:
+      - master
+      - develop
+
+jobs:
+  benchmark:
+    name: ${{ matrix.os }} x ${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest ] #, macos-latest, windows-latest ]
+        python: ['3.8']
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Run benchmark
+        run: |
+          pip install -r requirements.txt
+          pip install -r requirements-test.txt
+          pytest tests/benchmarks/bench.py --benchmark-json benchmark.json
+      - name: Store benchmark result
+        uses: rhysd/github-action-benchmark@v1
+        with:
+          name: Pandas Profiling Benchmarks
+          tool: 'pytest'
+          output-file-path: benchmark.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index 89c4b5df1..137ae3666 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,6 +3,7 @@ coverage<5
 codecov
 pytest-mypy
 pytest-cov
+pytest-benchmark~=3.2.2
 nbval
 pyarrow
 flake8
diff --git a/tests/benchmarks/bench.py b/tests/benchmarks/bench.py
new file mode 100644
index 000000000..9cec766d1
--- /dev/null
+++ b/tests/benchmarks/bench.py
@@ -0,0 +1,74 @@
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_file
+
+
+def test_titanic_explorative(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
+        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+    )
+
+    data = pd.read_parquet(file_name)
+
+    def func(df):
+        profile = ProfileReport(
+            df, title="Titanic Dataset", explorative=True, progress_bar=False
+        )
+        report = profile.to_html()
+        return report
+
+    benchmark(func, data)
+
+
+def test_titanic_default(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
+        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+    )
+
+    data = pd.read_parquet(file_name)
+
+    def func(df):
+        profile = ProfileReport(df, title="Titanic Dataset", progress_bar=False)
+        report = profile.to_html()
+        return report
+
+    benchmark(func, data)
+
+
+def test_titanic_minimal(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
"https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet", + ) + + data = pd.read_parquet(file_name) + + def func(df): + profile = ProfileReport( + df, title="Titanic Dataset", minimal=True, progress_bar=False + ) + report = profile.to_html() + return report + + benchmark(func, data) + + +def test_rdw_minimal(benchmark): + file_name = cache_file( + "rdw.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw.parquet", + ) + + data = pd.read_parquet(file_name) + + def func(df): + profile = ProfileReport( + df, title="RDW Dataset", minimal=True, progress_bar=False + ) + report = profile.to_html() + return report + + benchmark(func, data) diff --git a/tests/performance/time_inf.py b/tests/performance/time_inf.py deleted file mode 100644 index ba2aecaa4..000000000 --- a/tests/performance/time_inf.py +++ /dev/null @@ -1,25 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(10000) -series = pd.Series(vals) -series[series < 0.3] = np.nan -series[series < 0.2] = np.Inf - - - -def f1(series): - return len(series.loc[(~np.isfinite(series)) & series.notnull()]) - - -def f2(series): - return ((series == np.inf) | (series == -np.inf)).sum() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_kurtosis.py b/tests/performance/time_kurtosis.py deleted file mode 100644 index dfa106272..000000000 --- a/tests/performance/time_kurtosis.py +++ /dev/null @@ -1,36 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd -import scipy.stats - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - -def f1(series): - arr = series.values - return scipy.stats.kurtosis(arr, bias=False, nan_policy='omit') - - -def f2(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return scipy.stats.kurtosis(arr_without_nan, bias=False) - - -def f3(series): - return series.kurtosis() - - -def f4(series): - return series[series.notna()].kurtosis() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) -print(timeit.timeit("f3(series)", number=10, setup=testcode)) -print(timeit.timeit("f4(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_mad.py b/tests/performance/time_mad.py deleted file mode 100644 index 8c6107614..000000000 --- a/tests/performance/time_mad.py +++ /dev/null @@ -1,56 +0,0 @@ -import timeit - -testcode = ''' -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - - -def mad(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. - https://en.wikipedia.org/wiki/Median_absolute_deviation - """ - arr = np.ma.array(arr).compressed() # should be faster to not use masked arrays. - med = np.median(arr) - return np.median(np.abs(arr - med)) - - -def mad2(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. 
-        https://en.wikipedia.org/wiki/Median_absolute_deviation
-    """
-    med = np.median(arr)
-    return np.median(np.abs(arr - med))
-
-
-def f1(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return mad(arr_without_nan)
-
-
-def f2(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return mad(arr_without_nan)
-
-
-def f3(series):
-    return series.mad()
-
-
-def f4(series):
-    return series[series.notna()].mad()
-'''
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/time_mean.py b/tests/performance/time_mean.py
deleted file mode 100644
index f6149a4c0..000000000
--- a/tests/performance/time_mean.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import timeit
-
-testcode = """
-import numpy as np
-import pandas as pd
-
-np.random.seed(12)
-vals = np.random.random(1000)
-series = pd.Series(vals)
-series[series < 0.2] = pd.NA
-
-
-def f1(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return np.mean(arr_without_nan)
-
-
-def f2(series):
-    arr = series.values
-    return np.nanmean(arr)
-
-
-def f3(series):
-    return series.mean()
-
-
-def f4(series):
-    return series[series.notna()].mean()
-"""
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/timings.py b/tests/performance/timings.py
deleted file mode 100644
index acde9360d..000000000
--- a/tests/performance/timings.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import timeit
-from itertools import product
-from string import ascii_lowercase
-
-import numpy as np
-import pandas as pd
-import seaborn as sns
-from matplotlib import pyplot as plt
-
-from pandas_profiling import ProfileReport
-
-
-def generate_column_names(n):
-    column_names = []
-    iters = 1
-    while len(column_names) < n:
-        column_names += list(
-            "".join(combo) for combo in product(ascii_lowercase, repeat=iters)
-        )
-        iters += 1
-    return column_names
-
-
-def make_sample_data(cols, rows):
-    column_names = generate_column_names(cols)
-
-    df = pd.DataFrame(
-        np.random.randint(0, 1000000, size=(rows, cols)), columns=column_names[0:cols]
-    )
-    df = df.astype(str)
-
-    assert df.shape == (rows, cols)
-    return df.copy()
-
-
-def make_report_minimal(df):
-    report = ProfileReport(
-        df,
-        minimal=True,
-        pool_size=0,
-        sort="None",
-        title="Dataset with Numeric Categories",
-    )
-    html = report.to_html()
-    assert type(html) == str and 'Dataset info' in html
-
-
-def make_report(df):
-    report = ProfileReport(
-        df,
-        minimal=False,
-        pool_size=0,
-        sort="None",
-        title="Dataset with Numeric Categories",
-    )
-    html = report.to_html()
-    assert type(html) == str and 'Dataset info' in html
-
-
-def wrap_func(function):
-    def inner(df):
-        def double_inner():
-            return function(df)
-
-        return double_inner
-
-    return inner
-
-
-def time_report(func, cols, rows, runs=5):
-    df = make_sample_data(cols, rows)
-    print(df.shape)
-    test = wrap_func(func)(df.copy())
-    return timeit.timeit(test, number=runs) / runs
-
-
-def plot_col_run_time():
-    cols = [2, 4, 10, 50]
-    row = 1000
-    default_times = [time_report(make_report, col, row) for col in cols]
-    minimal_times = [time_report(make_report_minimal, col, row) for col in cols]
-
-    ax1 = sns.scatterplot(cols, default_times)
-    ax2 = sns.scatterplot(cols, minimal_times)
-    _ = ax1.set(
-        xlabel=f"Number of columns (row={row})",
-        ylabel="time (s)",
-        title="Run Time Complexity",
-    )
-    plt.show()
-
-
-def plot_row_run_time():
-    # 10, 100
-    # https://github.com/pandas-profiling/pandas-profiling/issues/270
-    rows = [1000, 10000, 100000]
-    col = 10
-    default_times = [time_report(make_report, col, row) for row in rows]
-    minimal_times = [time_report(make_report_minimal, col, row) for row in rows]
-
-    ax1 = sns.scatterplot(rows, default_times)
-    ax2 = sns.scatterplot(rows, minimal_times)
-    _ = ax1.set(
-        xlabel=f"Number of rows (col={col})",
-        ylabel="time (s)",
-        title="Run Time Complexity",
-    )
-    plt.show()
-
-
-if __name__ == "__main__":
-    plot_col_run_time()
-    plot_row_run_time()
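
This patch replaces the ad-hoc timeit scripts under tests/performance/ with pytest-benchmark tests that CI can track across commits. To reproduce the CI run locally, the commands below mirror the workflow's "Run benchmark" step above (a sketch; it assumes a checkout of the repository root with a compatible Python environment active):

    # Install runtime and test dependencies (the latter pulls in pytest-benchmark).
    pip install -r requirements.txt
    pip install -r requirements-test.txt

    # Run only the benchmark suite. pytest-benchmark provides the `benchmark`
    # fixture used in tests/benchmarks/bench.py and writes machine-readable
    # results to benchmark.json, which rhysd/github-action-benchmark then
    # stores and tracks on CI.
    pytest tests/benchmarks/bench.py --benchmark-json benchmark.json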