|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Benchmark comparisons between the built-in statistics.median_low function |
| 4 | +and the selectlib selection functions (nth_element, quickselect, and heapselect) |
| 5 | +for computing the median (low median for even lengths) of a list. |
| 6 | +
|
| 7 | +For each list size (ranging from 1,000 to 1,000,000 elements), |
| 8 | +the script generates a random list of integers. For each method, the test is run 5 times |
| 9 | +and the median runtime (in seconds) is recorded. |
| 10 | +
|
| 11 | +Methods benchmarked: |
| 12 | + 1. median_low – Uses statistics.median_low to compute the median. |
| 13 | + 2. nth_element – Uses selectlib.nth_element to partition the list so that the median element is positioned correctly. |
| 14 | + 3. quickselect – Uses selectlib.quickselect for the median selection. |
| 15 | + 4. heapselect – Uses selectlib.heapselect for the median selection. |
| 16 | +
|
| 17 | +The results are then displayed as a grouped bar chart with one group per list size. |
| 18 | +""" |
| 19 | + |
| 20 | +import random |
| 21 | +import timeit |
| 22 | +import statistics |
| 23 | +import matplotlib.pyplot as plt |
| 24 | +import selectlib # our C extension module |
| 25 | +import statistics as stats |
| 26 | + |
| 27 | +# --------------------------------------------------------------------------- |
| 28 | +# Benchmark method definitions |
| 29 | +# Each method gets a copy of the original list and computes the median (low) |
| 30 | +# using the corresponding approach. |
| 31 | +# The median index is computed as (n-1)//2. |
| 32 | +# --------------------------------------------------------------------------- |
| 33 | + |
def bench_median_low(values):
    """
    Baseline: compute the low median with statistics.median_low.

    The input is copied first so this method pays the same copy cost as the
    selectlib-based benchmarks and never touches the caller's list.
    For even-length input the smaller of the two middle values is returned.
    """
    return stats.median_low(values.copy())
| 41 | + |
def bench_nth_element(values):
    """
    Compute the low median with selectlib.nth_element.

    Works on a copy of the input. nth_element is asked to place the element
    of rank (n-1)//2 at that index, and the value found there is returned.
    """
    work = values.copy()
    k = (len(work) - 1) // 2  # index of the low median
    selectlib.nth_element(work, k)
    return work[k]
| 52 | + |
def bench_quickselect(values):
    """
    Compute the low median with selectlib.quickselect.

    Works on a copy of the input. quickselect is asked to place the element
    of rank (n-1)//2 at that index, and the value found there is returned.
    """
    work = values.copy()
    k = (len(work) - 1) // 2  # index of the low median
    selectlib.quickselect(work, k)
    return work[k]
| 62 | + |
def bench_heapselect(values):
    """
    Compute the low median with selectlib.heapselect.

    Works on a copy of the input. heapselect is asked to place the element
    of rank (n-1)//2 at that index, and the value found there is returned.
    """
    work = values.copy()
    k = (len(work) - 1) // 2  # index of the low median
    selectlib.heapselect(work, k)
    return work[k]
| 72 | + |
# Registry of benchmarked methods: display name -> callable taking the list.
# Insertion order is the order in which the methods are timed.
methods = dict(
    median_low=bench_median_low,
    nth_element=bench_nth_element,
    quickselect=bench_quickselect,
    heapselect=bench_heapselect,
)
| 80 | + |
| 81 | +# --------------------------------------------------------------------------- |
| 82 | +# Benchmark runner |
| 83 | +# --------------------------------------------------------------------------- |
def run_benchmarks(sizes=(1000, 10_000, 100_000, 1_000_000), repeat=5,
                   benchmark_methods=None):
    """
    Runs the benchmarks for various list sizes.

    For each list size N in ``sizes`` a list of N random integers drawn from
    [0, 1_000_000] is generated. Every method is then timed ``repeat`` times on
    that list (one call per timing, via timeit.repeat) and the median of those
    timings is recorded. Progress and per-run timings are printed.

    Args:
        sizes: Iterable of list lengths to benchmark. Defaults to
            1,000 / 10,000 / 100,000 / 1,000,000.
        repeat: Number of timed runs per method and size. Must be >= 1.
        benchmark_methods: Mapping of display name -> callable taking the
            list. Defaults to the module-level ``methods`` registry.

    Returns:
        A dictionary {N: {method_name: median_time_in_seconds}}.

    Raises:
        ValueError: If ``repeat`` is smaller than 1 (there would be no
            timings to take a median of).
    """
    if repeat < 1:
        raise ValueError("repeat must be at least 1")
    if benchmark_methods is None:
        benchmark_methods = methods

    overall_results = {}  # {N: {method: time_in_seconds, ...}}

    for N in sizes:
        print(f"\nBenchmarking for N = {N:,} (median index = {(N-1)//2:,})")
        # One shared input per size so every method sorts/selects the same data.
        original = [random.randint(0, 1_000_000) for _ in range(N)]

        results = {}
        for name, func in benchmark_methods.items():
            # Bind func/original as defaults so the callable does not depend
            # on the loop variables after this iteration.
            times = timeit.repeat(
                stmt=lambda f=func, data=original: f(data),
                repeat=repeat,
                number=1,
            )
            med_time = statistics.median(times)
            results[name] = med_time
            times_ms = [f"{t*1000:,.3f}" for t in times]  # format as milliseconds
            print(f"  {name:12}: median = {med_time*1000:,.3f} ms  (runs: {times_ms})")
        overall_results[N] = results
    return overall_results
| 114 | + |
| 115 | +# --------------------------------------------------------------------------- |
| 116 | +# Plotting results |
| 117 | +# --------------------------------------------------------------------------- |
def plot_results(results):
    """
    Draws the benchmark results as a grouped bar chart.

    ``results`` maps each list size N to a dict of {method name: seconds}.
    One group of bars is drawn per list size (in ascending order), with one
    bar per method showing its median runtime in milliseconds. The figure is
    written to "plot_median.png" and then displayed.
    """
    sizes = sorted(results)
    group_x = list(range(len(sizes)))

    bar_width = 0.18
    # (method name, bar colour, horizontal shift of the bar inside its group)
    series = (
        ("median_low", '#1f77b4', -1.5*bar_width),
        ("nth_element", '#ff7f0e', -0.5*bar_width),
        ("quickselect", '#2ca02c', 0.5*bar_width),
        ("heapselect", '#d62728', 1.5*bar_width),
    )

    plt.figure(figsize=(10, 6))

    for method, colour, shift in series:
        heights_ms = [results[size][method]*1000 for size in sizes]
        xs = [x + shift for x in group_x]
        rects = plt.bar(xs, heights_ms, width=bar_width, label=method, color=colour)
        # Print the numeric value above each bar.
        plt.bar_label(rects, fmt='%.2f', padding=3, fontsize=8)

    # Label each group with its list size, formatted with thousands separators.
    plt.xticks(group_x, [f"{size:,}" for size in sizes])
    plt.xlabel("List size (N)")
    plt.ylabel("Time (ms)")
    plt.title("Benchmark: statistics.median_low vs. selectlib selection methods (median)")
    plt.legend(title="Method")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.savefig("plot_median.png")
    plt.show()
| 168 | + |
| 169 | +# --------------------------------------------------------------------------- |
| 170 | +# Main |
| 171 | +# --------------------------------------------------------------------------- |
if __name__ == '__main__':
    # Time every method at every list size, then chart the median runtimes.
    plot_results(run_benchmarks())
0 commit comments