Commit 879b9e8

carlpaulz authored and committed
1 parent 736e20c

fix: add next_sample_size_no_failure to calculate wiki page example

Signed-off-by: Paul Zabelin <paulzabelin@artium.ai>

File tree: 2 files changed (+78, -22 lines)


examples/team_recommender/tests/test_helpers.py

Lines changed: 52 additions & 10 deletions
@@ -142,12 +142,12 @@ def test_seventy_percent_confidence_ranges_from_fifty_to_ninety():
 
 
 def next_success_rate(sample_size) -> float:
-    return 1 - 1 / (sample_size + 1)
+    return sample_size / (sample_size + 1)
 
 
 def test_next_success_rate():
     assert next_success_rate(1) == 0.5
-    assert next_success_rate(2) == 0.6666666666666667
+    assert next_success_rate(2) == pytest.approx(0.6667, rel=0.01)
     assert next_success_rate(3) == 0.75
     assert next_success_rate(4) == 0.8
     assert next_success_rate(10) == 0.9090909090909091
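
The two formulas are algebraically identical, but they round differently in IEEE-754 doubles, which is presumably why the exact-equality assertion for n=2 had to become pytest.approx. A minimal standalone sketch (not part of the commit):

    import pytest

    old = 1 - 1 / (2 + 1)  # 0.6666666666666667
    new = 2 / (2 + 1)      # 0.6666666666666666
    assert old != new      # the two results differ by one ulp
    assert new == pytest.approx(0.6667, rel=0.01)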
@@ -175,16 +175,18 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample
 
 def test_next_sample_size():
     ## Next sample size should be larger than the current one by at least 4 times
-    assert next_sample_size(10) == 45, (
+    assert next_sample_size_with_1_failure(10) == 45, (
         "passing 10 out of 10 should require 45 successful runs to be statistically significant"
     )
-    assert next_sample_size(45) == 185, (
+    assert next_sample_size_with_1_failure(45) == 185, (
         "passing 45 out of 45 should require 185 successful runs to be statistically significant"
     )
-    assert next_sample_size(185) == 745
-    assert next_sample_size(745) == 2985
-    assert next_sample_size(29) == 121
-    assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"
+    assert next_sample_size_with_1_failure(185) == 745
+    assert next_sample_size_with_1_failure(745) == 2985
+    assert next_sample_size_with_1_failure(29) == 121
+    assert next_sample_size_with_1_failure(29) == next_sample_size_via_loop_with_1_failure(29), (
+        "calculated via loop should match"
+    )
 
     assert 28 / 29 == pytest.approx(0.96, rel=0.01)
     before = analyse_measure_from_test_sample(28, 29)
@@ -196,20 +198,60 @@ def test_next_sample_size():
     assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)
 
 
-def next_sample_size(current):
+def next_sample_size_with_1_failure(current):
     ## How many successful runs are needed to be statistically significant improvement
     # compared to the current sample size with 100% success rate at 90% confidence
     return 4 * current + 5
 
 
-def next_sample_size_via_loop(sample_size: int) -> int:
+def next_sample_size_via_loop_with_1_failure(sample_size: int) -> int:
     goal_success_rate = next_success_rate(sample_size)
     for i in range(sample_size, 5 * sample_size):
         if not is_within_expected(goal_success_rate, 1, i):
             return i
     return 0
 
 
+def next_sample_size_via_loop_no_failure(sample_size: int) -> int:
+    goal_success_rate = next_success_rate(sample_size)
+    for i in range(sample_size, 5 * sample_size):
+        if not is_within_expected(goal_success_rate, 0, i):
+            return i
+    return 0
+
+
+def next_sample_size_no_failure(sample_size: int) -> int:
+    return 2 * sample_size + 3
+
+
+@pytest.mark.parametrize(
+    "sample_size, expected",
+    [
+        (10, 45),
+        (45, 185),
+        (185, 745),
+        (745, 2985),
+        (29, 121),
+    ],
+)
+def test_next_sample_size_via_loop(sample_size, expected):
+    assert next_sample_size_via_loop_with_1_failure(sample_size) == expected
+
+
+@pytest.mark.parametrize(
+    "sample_size, expected",
+    [
+        (10, 23),
+        (23, 49),
+        (49, 101),
+        (101, 205),
+        (205, 413),
+    ],
+)
+def test_next_no_failure_sample_size_via_loop(sample_size, expected):
+    assert next_sample_size_via_loop_no_failure(sample_size) == expected
+
+
 def test_success_rate():
     tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
     assert tiny_set_analysis.proportion == 0.5
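
The two closed forms are consistent with both parametrized tables: iterating 4n + 5 gives 10 → 45 → 185 → 745 → 2985, and iterating 2n + 3 gives 10 → 23 → 49 → 101 → 205 → 413. A quick standalone check (not from the commit; the loop variants also need the real is_within_expected helper, which this file imports):

    def next_sample_size_with_1_failure(current: int) -> int:
        return 4 * current + 5

    def next_sample_size_no_failure(sample_size: int) -> int:
        return 2 * sample_size + 3

    # same (sample_size, expected) pairs as the parametrized tests above
    assert [next_sample_size_with_1_failure(n) for n in (10, 45, 185, 745, 29)] == [45, 185, 745, 2985, 121]
    assert [next_sample_size_no_failure(n) for n in (10, 23, 49, 101, 205)] == [23, 49, 101, 205, 413]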

examples/team_recommender/tests/test_proportions_ztest.py

Lines changed: 26 additions & 12 deletions
@@ -1,7 +1,14 @@
+from math import isnan
+
 import pytest
 from helpers import is_within_expected
 from statsmodels.stats.proportion import proportions_ztest
-from test_helpers import next_sample_size, next_success_rate
+from test_helpers import (
+    next_sample_size_no_failure,
+    next_sample_size_via_loop_with_1_failure,
+    next_sample_size_with_1_failure,
+    next_success_rate,
+)
 
 
 def test_proportions_ztest_improvement():
@@ -19,6 +26,14 @@ def test_proportions_ztest_exact_match():
     assert p_value == 1.0, "statistically insignificant result"
     assert stat == 0
 
+    stat, p_value = proportions_ztest(7, 10, 0.7, prop_var=1)
+    assert isnan(p_value)
+    assert isnan(stat)
+
+    stat, p_value = proportions_ztest(1, 10, 0.7, prop_var=0.5)
+    assert p_value == pytest.approx(0.00014, rel=0.1)
+    assert stat == pytest.approx(-3.79, rel=0.01)
+
 
 def test_proportions_ztest_significantly_better():
     stat, p_value = proportions_ztest(9, 10, 0.7)
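
The prop_var keyword pins the variance used in the denominator, so the new assertions follow from arithmetic. Assuming statsmodels computes z = (p_hat - value) / sqrt(prop_var * (1 - prop_var) / nobs) when prop_var is supplied: with prop_var=1 the variance is 1 * (1 - 1) = 0 and 7/10 - 0.7 = 0, so z is 0/0 = nan; with prop_var=0.5 a back-of-the-envelope check reproduces the asserted statistic:

    from math import sqrt

    # 1 success out of 10 against a 70% baseline, variance pinned at 0.5
    z = (1 / 10 - 0.7) / sqrt(0.5 * (1 - 0.5) / 10)
    assert round(z, 2) == -3.79  # matches the pytest.approx(-3.79) above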
@@ -58,7 +73,7 @@ def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
 
 
 def is_statistically_significant(success, failure, sample_size):
-    return calculate_p_value(success, failure, sample_size) < 0.05
+    return calculate_p_value(success, failure, sample_size) <= 0.05
 
 
 def test_not_is_statistically_significant():
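
The only behavioral difference between < and <= is at the boundary: a p-value landing exactly on 0.05 now counts as significant. A trivial illustration (hypothetical helpers, not from the commit):

    def is_significant_strict(p_value: float) -> bool:
        return p_value < 0.05

    def is_significant_inclusive(p_value: float) -> bool:
        return p_value <= 0.05

    assert not is_significant_strict(0.05)
    assert is_significant_inclusive(0.05)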
@@ -79,7 +94,7 @@ def test_is_statistically_significant_with_next_success_rate():
     sample_size = 10
     assert is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
     assert is_statistically_significant(
-        next_success_rate(sample_size), 0, next_sample_size(sample_size)
+        next_success_rate(sample_size), 0, next_sample_size_with_1_failure(sample_size)
     )
     assert is_statistically_significant(next_success_rate(35), 0, 109)
 

@@ -90,39 +105,38 @@ def test_example_on_wiki():
     assert is_within_expected(success_rate, 1, sample_size)
     assert not is_statistically_significant(success_rate, 1, sample_size)
     next_rate = next_success_rate(sample_size)
-    next_size = next_sample_size(sample_size)
-    assert next_size == 193
+    next_size = next_sample_size_no_failure(sample_size)
+    assert next_sample_size_via_loop_with_1_failure(sample_size) == 193
+    assert next_size == 97
     assert next_rate == pytest.approx(0.98, rel=0.01)
 
     assert not is_within_expected(0.95, 1, next_size)
     assert not is_within_expected(next_rate, 0, next_size)
-    assert not is_within_expected(next_rate, 1, next_size)
-    assert is_within_expected(next_rate, 2, next_size)
+    assert is_within_expected(next_rate, 1, next_size)
 
     assert is_statistically_significant(next_rate, 0, next_size)
-    assert is_statistically_significant(next_rate, 1, next_size)
-    assert not is_statistically_significant(next_rate, 2, next_size)
+    assert not is_statistically_significant(next_rate, 1, next_size)
 
 
 def test_compare_is_within_expected_and_is_statistically_significant():
     assert is_within_expected(0.7, 3, 10), "not significant result for 3/10=70%"
     assert not is_statistically_significant(0.7, 3, 10), "not significant for 3/10=70%"
 
     assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
-    assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"
+    assert is_statistically_significant(0.7, 0, 1000), "not significant result for 0 out of 3"
 
 
 def test_improvement_from_70_percent():
     assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
-    assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
+    assert is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
 
     assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
     assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"
 
 
 def test_improvement_from_97_percent():
     assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
-    assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
+    assert is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
 
     assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
     assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"
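
For reference, the wiki example's numbers imply a current sample of 47 (the diff doesn't show the setup lines): 4 * 47 + 5 = 193 with one failure allowed, 2 * 47 + 3 = 97 with none, and a target rate of 47/48 ≈ 0.98. The underlying one-sample test is plain statsmodels; a minimal self-contained sketch of the call this file exercises, assuming the default variance uses the sample proportion:

    from statsmodels.stats.proportion import proportions_ztest

    # 9 successes out of 10 runs against a 70% baseline; with the default
    # prop_var the denominator uses the sample proportion, so
    # z = (0.9 - 0.7) / sqrt(0.9 * 0.1 / 10), roughly 2.11
    stat, p_value = proportions_ztest(9, 10, 0.7)
    print(stat, p_value)  # p_value below 0.05 reads as a significant improvement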
