Skip to content

Commit 6209002

Browse files
committed
Add benchmark for median
1 parent 347cab2 commit 6209002

File tree

2 files changed

+174
-0
lines changed

2 files changed

+174
-0
lines changed

benchmark_median.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Benchmark comparisons between the standard-library statistics.median_low function
4+
and the selectlib selection functions (nth_element, quickselect, and heapselect)
5+
for computing the median (low median for even lengths) of a list.
6+
7+
For each list size (ranging from 1,000 to 1,000,000 elements),
8+
the script generates a random list of integers. For each method, the test is run 5 times
9+
and the median runtime (in seconds) is recorded.
10+
11+
Methods benchmarked:
12+
1. median_low – Uses statistics.median_low to compute the median.
13+
2. nth_element – Uses selectlib.nth_element to partition the list so that the median element is positioned correctly.
14+
3. quickselect – Uses selectlib.quickselect for the median selection.
15+
4. heapselect – Uses selectlib.heapselect for the median selection.
16+
17+
The results are then displayed as a grouped bar chart with one group per list size.
18+
"""
19+
20+
import random
21+
import timeit
22+
import statistics
23+
import matplotlib.pyplot as plt
24+
import selectlib # our C extension module
25+
import statistics as stats
26+
27+
# ---------------------------------------------------------------------------
28+
# Benchmark method definitions
29+
# Each method gets a copy of the original list and computes the median (low)
30+
# using the corresponding approach.
31+
# The median index is computed as (n-1)//2.
32+
# ---------------------------------------------------------------------------
33+
34+
def bench_median_low(values):
    """
    Return the low median of ``values`` via statistics.median_low.

    The input is copied first so that this method pays the same list-copy
    cost as the other benchmarked methods; ``values`` itself is never
    modified. For an even number of items the smaller of the two middle
    values is returned.
    """
    return stats.median_low(values.copy())
41+
42+
def bench_nth_element(values):
    """
    Return the low median of ``values`` using selectlib.nth_element.

    Works on a copy of the input. selectlib.nth_element is called with the
    low-median index k = (n - 1) // 2 and is expected to rearrange the copy
    in place so that the median ends up at index k; that element is returned.
    """
    working = values.copy()
    k = (len(working) - 1) // 2
    selectlib.nth_element(working, k)
    return working[k]
52+
53+
def bench_quickselect(values):
    """
    Return the low median of ``values`` using selectlib.quickselect.

    Works on a copy of the input. selectlib.quickselect is called with the
    low-median index k = (n - 1) // 2 and is expected to rearrange the copy
    in place so that the median ends up at index k; that element is returned.
    """
    working = values.copy()
    k = (len(working) - 1) // 2
    selectlib.quickselect(working, k)
    return working[k]
62+
63+
def bench_heapselect(values):
    """
    Return the low median of ``values`` using selectlib.heapselect.

    Works on a copy of the input. selectlib.heapselect is called with the
    low-median index k = (n - 1) // 2 and is expected to rearrange the copy
    in place so that the median ends up at index k; that element is returned.
    """
    working = values.copy()
    k = (len(working) - 1) // 2
    selectlib.heapselect(working, k)
    return working[k]
72+
73+
# Dictionary of methods to benchmark.
74+
# Display name -> benchmark callable. Insertion order is the order in which
# run_benchmarks() iterates over (and therefore times) the methods.
methods = dict(
    median_low=bench_median_low,
    nth_element=bench_nth_element,
    quickselect=bench_quickselect,
    heapselect=bench_heapselect,
)
80+
81+
# ---------------------------------------------------------------------------
82+
# Benchmark runner
83+
# ---------------------------------------------------------------------------
84+
def run_benchmarks(sizes=(1000, 10_000, 100_000, 1_000_000), repeats=5,
                   bench_methods=None):
    """
    Time every benchmark method on random integer lists of several sizes.

    For each size N, a list of N random integers drawn from
    [0, 1,000,000] is generated. Every method is then timed ``repeats``
    times with timeit.repeat (one call per timing) and the median of those
    timings is recorded. Progress and the individual timings are printed
    in milliseconds as the benchmark runs.

    Args:
        sizes: Iterable of list lengths to benchmark. Defaults to
            1,000 / 10,000 / 100,000 / 1,000,000.
        repeats: Number of timed runs per method and size; must be >= 1.
        bench_methods: Mapping of display name -> callable that accepts the
            generated list. Defaults to the module-level ``methods`` table.

    Returns:
        dict: ``{N: {method_name: median_time_in_seconds, ...}, ...}``.

    Raises:
        ValueError: If ``repeats`` is smaller than 1 (there would be no
            timings to take a median of).
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")
    if bench_methods is None:
        # Resolved at call time so the default always reflects the current
        # module-level table.
        bench_methods = methods

    overall_results = {}  # {N: {method: time_in_seconds, ...}}

    for N in sizes:
        print(f"\nBenchmarking for N = {N:,} (median index = {(N-1)//2:,})")
        # Every method is timed against the same input for a given N.
        original = [random.randint(0, 1_000_000) for _ in range(N)]

        results = {}
        for name, func in bench_methods.items():
            # number=1: each timing is a single call of the method on the
            # full list. The lambda is consumed immediately, so capturing the
            # loop variables here is safe.
            times = timeit.repeat(
                stmt=lambda: func(original), repeat=repeats, number=1
            )
            med_time = statistics.median(times)
            results[name] = med_time
            times_ms = [f"{t*1000:,.3f}" for t in times]  # format as milliseconds
            print(f" {name:12}: median = {med_time*1000:,.3f} ms (runs: {times_ms})")
        overall_results[N] = results
    return overall_results
114+
115+
# ---------------------------------------------------------------------------
116+
# Plotting results
117+
# ---------------------------------------------------------------------------
118+
def plot_results(results):
    """
    Draw the benchmark results as a grouped bar chart.

    ``results`` maps each list size N to a dict of
    ``{method_name: runtime_in_seconds}``. One group of bars is drawn per
    list size (in ascending order), with one bar per method showing its
    runtime converted to milliseconds. The figure is written to
    "plot_median.png" and then displayed.
    """
    sizes = sorted(results)
    group_centers = list(range(len(sizes)))

    bar_width = 0.18
    # (method, colour) in left-to-right drawing order within each group.
    series = (
        ("median_low", '#1f77b4'),
        ("nth_element", '#ff7f0e'),
        ("quickselect", '#2ca02c'),
        ("heapselect", '#d62728'),
    )

    plt.figure(figsize=(10, 6))

    for slot, (method, color) in enumerate(series):
        # Four bars per group, centred on the tick: shifts of -1.5, -0.5,
        # +0.5 and +1.5 bar widths.
        shift = (slot - 1.5) * bar_width
        heights_ms = [results[size][method] * 1000 for size in sizes]
        xs = [center + shift for center in group_centers]
        bars = plt.bar(xs, heights_ms, width=bar_width, label=method, color=color)
        plt.bar_label(bars, fmt='%.2f', padding=3, fontsize=8)

    # One tick per group, labelled with the comma-formatted list size.
    plt.xticks(group_centers, [f"{size:,}" for size in sizes])
    plt.xlabel("List size (N)")
    plt.ylabel("Time (ms)")
    plt.title("Benchmark: statistics.median_low vs. selectlib selection methods (median)")
    plt.legend(title="Method")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.savefig("plot_median.png")
    plt.show()
168+
169+
# ---------------------------------------------------------------------------
170+
# Main
171+
# ---------------------------------------------------------------------------
172+
if __name__ == '__main__':
    # Collect all timings first, then chart them in a single figure.
    bench_results = run_benchmarks()
    plot_results(bench_results)

plot_median.png

43.1 KB
Loading

0 commit comments

Comments
 (0)