From 5877fb01cf078e20ddaeb819e9c6da1fdc87bf59 Mon Sep 17 00:00:00 2001
From: sbrugman
Date: Wed, 7 Apr 2021 18:19:46 +0200
Subject: [PATCH] Benchmark introduction

---
 .github/workflows/benchmark.yml    |  37 ++++
 requirements-test.txt              |   1 +
 tests/benchmarks/bench.py          |  74 +++++++++++++++++++
 tests/performance/time_inf.py      |  25 -------
 tests/performance/time_kurtosis.py |  36 ---------
 tests/performance/time_mad.py      |  56 --------------
 tests/performance/time_mean.py     |  36 ---------
 tests/performance/timings.py       | 113 -----------------------------
 8 files changed, 112 insertions(+), 266 deletions(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 tests/benchmarks/bench.py
 delete mode 100644 tests/performance/time_inf.py
 delete mode 100644 tests/performance/time_kurtosis.py
 delete mode 100644 tests/performance/time_mad.py
 delete mode 100644 tests/performance/time_mean.py
 delete mode 100644 tests/performance/timings.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 000000000..24c1bf373
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,37 @@
+name: Performance Benchmarks
+
+on:
+  push:
+    branches:
+      - master
+      - develop
+
+jobs:
+  benchmark:
+    name: ${{ matrix.os }} x ${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest ] #, macos-latest, windows-latest ]
+        python: ['3.8']
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Run benchmark
+        run: |
+          pip install -r requirements.txt
+          pip install -r requirements-test.txt
+          pytest tests/benchmarks/bench.py --benchmark-json benchmark.json
+      - name: Store benchmark result
+        uses: rhysd/github-action-benchmark@v1
+        with:
+          name: Pandas Profiling Benchmarks
+          tool: 'pytest'
+          output-file-path: benchmark.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index 89c4b5df1..137ae3666 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,6 +3,7 @@ coverage<5
 codecov
 pytest-mypy
 pytest-cov
+pytest-benchmark~=3.2.2
 nbval
 pyarrow
 flake8
diff --git a/tests/benchmarks/bench.py b/tests/benchmarks/bench.py
new file mode 100644
index 000000000..9cec766d1
--- /dev/null
+++ b/tests/benchmarks/bench.py
@@ -0,0 +1,74 @@
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_file
+
+
+def test_titanic_explorative(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
+        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+    )
+
+    data = pd.read_parquet(file_name)
+
+    def func(df):
+        profile = ProfileReport(
+            df, title="Titanic Dataset", explorative=True, progress_bar=False
+        )
+        report = profile.to_html()
+        return report
+
+    benchmark(func, data)
+
+
+def test_titanic_default(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
+        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
+    )
+
+    data = pd.read_parquet(file_name)
+
+    def func(df):
+        profile = ProfileReport(df, title="Titanic Dataset", progress_bar=False)
+        report = profile.to_html()
+        return report
+
+    benchmark(func, data)
+
+
+def test_titanic_minimal(benchmark):
+    file_name = cache_file(
+        "titanic.parquet",
"https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet", + ) + + data = pd.read_parquet(file_name) + + def func(df): + profile = ProfileReport( + df, title="Titanic Dataset", minimal=True, progress_bar=False + ) + report = profile.to_html() + return report + + benchmark(func, data) + + +def test_rdw_minimal(benchmark): + file_name = cache_file( + "rdw.parquet", + "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw.parquet", + ) + + data = pd.read_parquet(file_name) + + def func(df): + profile = ProfileReport( + df, title="RDW Dataset", minimal=True, progress_bar=False + ) + report = profile.to_html() + return report + + benchmark(func, data) diff --git a/tests/performance/time_inf.py b/tests/performance/time_inf.py deleted file mode 100644 index ba2aecaa4..000000000 --- a/tests/performance/time_inf.py +++ /dev/null @@ -1,25 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(10000) -series = pd.Series(vals) -series[series < 0.3] = np.nan -series[series < 0.2] = np.Inf - - - -def f1(series): - return len(series.loc[(~np.isfinite(series)) & series.notnull()]) - - -def f2(series): - return ((series == np.inf) | (series == -np.inf)).sum() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_kurtosis.py b/tests/performance/time_kurtosis.py deleted file mode 100644 index dfa106272..000000000 --- a/tests/performance/time_kurtosis.py +++ /dev/null @@ -1,36 +0,0 @@ -import timeit - -testcode = """ -import numpy as np -import pandas as pd -import scipy.stats - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - -def f1(series): - arr = series.values - return scipy.stats.kurtosis(arr, bias=False, nan_policy='omit') - - -def f2(series): - arr = series.values - arr_without_nan = arr[~np.isnan(arr)] - return scipy.stats.kurtosis(arr_without_nan, bias=False) - - -def f3(series): - return series.kurtosis() - - -def f4(series): - return series[series.notna()].kurtosis() -""" - - -print(timeit.timeit("f1(series)", number=10, setup=testcode)) -print(timeit.timeit("f2(series)", number=10, setup=testcode)) -print(timeit.timeit("f3(series)", number=10, setup=testcode)) -print(timeit.timeit("f4(series)", number=10, setup=testcode)) diff --git a/tests/performance/time_mad.py b/tests/performance/time_mad.py deleted file mode 100644 index 8c6107614..000000000 --- a/tests/performance/time_mad.py +++ /dev/null @@ -1,56 +0,0 @@ -import timeit - -testcode = ''' -import numpy as np -import pandas as pd - -np.random.seed(12) -vals = np.random.random(1000) -series = pd.Series(vals) -series[series < 0.2] = pd.NA - - -def mad(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. - https://en.wikipedia.org/wiki/Median_absolute_deviation - """ - arr = np.ma.array(arr).compressed() # should be faster to not use masked arrays. - med = np.median(arr) - return np.median(np.abs(arr - med)) - - -def mad2(arr): - """ Median Absolute Deviation: a "Robust" version of standard deviation. - Indices variabililty of the sample. 
-        https://en.wikipedia.org/wiki/Median_absolute_deviation
-    """
-    med = np.median(arr)
-    return np.median(np.abs(arr - med))
-
-
-def f1(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return mad(arr_without_nan)
-
-
-def f2(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return mad(arr_without_nan)
-
-
-def f3(series):
-    return series.mad()
-
-
-def f4(series):
-    return series[series.notna()].mad()
-'''
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/time_mean.py b/tests/performance/time_mean.py
deleted file mode 100644
index f6149a4c0..000000000
--- a/tests/performance/time_mean.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import timeit
-
-testcode = """
-import numpy as np
-import pandas as pd
-
-np.random.seed(12)
-vals = np.random.random(1000)
-series = pd.Series(vals)
-series[series < 0.2] = pd.NA
-
-
-def f1(series):
-    arr = series.values
-    arr_without_nan = arr[~np.isnan(arr)]
-    return np.mean(arr_without_nan)
-
-
-def f2(series):
-    arr = series.values
-    return np.nanmean(arr)
-
-
-def f3(series):
-    return series.mean()
-
-
-def f4(series):
-    return series[series.notna()].mean()
-"""
-
-
-print(timeit.timeit("f1(series)", number=10, setup=testcode))
-print(timeit.timeit("f2(series)", number=10, setup=testcode))
-print(timeit.timeit("f3(series)", number=10, setup=testcode))
-print(timeit.timeit("f4(series)", number=10, setup=testcode))
diff --git a/tests/performance/timings.py b/tests/performance/timings.py
deleted file mode 100644
index acde9360d..000000000
--- a/tests/performance/timings.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import timeit
-from itertools import product
-from string import ascii_lowercase
-
-import numpy as np
-import pandas as pd
-import seaborn as sns
-from matplotlib import pyplot as plt
-
-from pandas_profiling import ProfileReport
-
-
-def generate_column_names(n):
-    column_names = []
-    iters = 1
-    while len(column_names) < n:
-        column_names += list(
-            "".join(combo) for combo in product(ascii_lowercase, repeat=iters)
-        )
-        iters += 1
-    return column_names
-
-
-def make_sample_data(cols, rows):
-    column_names = generate_column_names(cols)
-
-    df = pd.DataFrame(
-        np.random.randint(0, 1000000, size=(rows, cols)), columns=column_names[0:cols]
-    )
-    df = df.astype(str)
-
-    assert df.shape == (rows, cols)
-    return df.copy()
-
-
-def make_report_minimal(df):
-    report = ProfileReport(
-        df,
-        minimal=True,
-        pool_size=0,
-        sort="None",
-        title="Dataset with Numeric Categories",
-    )
-    html = report.to_html()
-    assert type(html) == str and 'Dataset info' in html
-
-
-def make_report(df):
-    report = ProfileReport(
-        df,
-        minimal=False,
-        pool_size=0,
-        sort="None",
-        title="Dataset with Numeric Categories",
-    )
-    html = report.to_html()
-    assert type(html) == str and 'Dataset info' in html
-
-
-def wrap_func(function):
-    def inner(df):
-        def double_inner():
-            return function(df)
-
-        return double_inner
-
-    return inner
-
-
-def time_report(func, cols, rows, runs=5):
-    df = make_sample_data(cols, rows)
-    print(df.shape)
-    test = wrap_func(func)(df.copy())
-    return timeit.timeit(test, number=runs) / runs
-
-
-def plot_col_run_time():
-    cols = [2, 4, 10, 50]
-    row = 1000
-    default_times = [time_report(make_report, col, row) for col in cols]
-    minimal_times = [time_report(make_report_minimal, col, row) for col in cols]
-
-    ax1 = sns.scatterplot(cols, default_times)
-    ax2 = sns.scatterplot(cols, minimal_times)
-    _ = ax1.set(
-        xlabel=f"Number of columns (row={row})",
-        ylabel="time (s)",
-        title="Run Time Complexity",
-    )
-    plt.show()
-
-
-def plot_row_run_time():
-    # 10, 100
-    # https://github.com/pandas-profiling/pandas-profiling/issues/270
-    rows = [1000, 10000, 100000]
-    col = 10
-    default_times = [time_report(make_report, col, row) for row in rows]
-    minimal_times = [time_report(make_report_minimal, col, row) for row in rows]
-
-    ax1 = sns.scatterplot(rows, default_times)
-    ax2 = sns.scatterplot(rows, minimal_times)
-    _ = ax1.set(
-        xlabel=f"Number of rows (col={col})",
-        ylabel="time (s)",
-        title="Run Time Complexity",
-    )
-    plt.show()
-
-
-if __name__ == "__main__":
-    plot_col_run_time()
-    plot_row_run_time()
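
This patch replaces the ad-hoc timeit scripts under tests/performance/ with pytest-benchmark tests that CI can track across commits. To reproduce the CI run locally, the commands below mirror the workflow's "Run benchmark" step above (a sketch; it assumes a checkout of the repository root with a compatible Python environment active):

    # Install runtime and test dependencies (the latter pulls in pytest-benchmark).
    pip install -r requirements.txt
    pip install -r requirements-test.txt

    # Run only the benchmark suite. pytest-benchmark provides the `benchmark`
    # fixture used in tests/benchmarks/bench.py and writes machine-readable
    # results to benchmark.json, which rhysd/github-action-benchmark then
    # stores and tracks on CI.
    pytest tests/benchmarks/bench.py --benchmark-json benchmark.json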