Merged
4 changes: 4 additions & 0 deletions examples/team_recommender/tests/helpers.py
@@ -79,6 +79,10 @@ def process_row(row: tuple[int, int, float]) -> str:
return f"{row[0]} failures out of {row[1]} is within {row[2] * 100:.0f}% success rate"


def is_statistically_significant(success_rate: float, failure_count: int, sample_size: int) -> bool:
    return not is_within_expected(success_rate, failure_count, sample_size)


def is_within_expected(success_rate: float, failure_count: int, sample_size: int) -> bool:
print(f"is_within_expected({success_rate}, {failure_count}, {sample_size})")
if sample_size <= 1:
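Note, not part of the diff: the new helper simply negates is_within_expected, so a failure count that falls outside the expected margin for a given success rate and sample size is treated as statistically significant. A minimal usage sketch, reusing the numbers from test_example_on_wiki further down and assuming the tests' import path:

# hypothetical usage sketch; helpers.py lives in examples/team_recommender/tests/
from helpers import is_statistically_significant, is_within_expected

# 1 failure in 47 runs is still compatible with an expected 95% success rate,
# so it does not count as a statistically significant deviation.
assert is_within_expected(0.95, failure_count=1, sample_size=47)
assert not is_statistically_significant(0.95, failure_count=1, sample_size=47)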
117 changes: 93 additions & 24 deletions examples/team_recommender/tests/test_helpers.py
@@ -5,6 +5,7 @@
    _assert_success_rate,
    failures_within_margin_of_error_from_expected,
    generate_examples,
    is_statistically_significant,
    is_within_expected,
    natural_sort_key,
)
@@ -142,12 +143,12 @@ def test_seventy_percent_confidence_ranges_from_fifty_to_ninety():


def next_success_rate(sample_size) -> float:
    return 1 - 1 / (sample_size + 1)
    return sample_size / (sample_size + 1)


def test_next_success_rate():
    assert next_success_rate(1) == 0.5
    assert next_success_rate(2) == 0.6666666666666667
    assert next_success_rate(2) == pytest.approx(0.6667, rel=0.01)
    assert next_success_rate(3) == 0.75
    assert next_success_rate(4) == 0.8
    assert next_success_rate(10) == 0.9090909090909091
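
For context, the two formulas are algebraically identical: n / (n + 1) = 1 - 1 / (n + 1). In floating point they can land on adjacent doubles, though, which is presumably why the exact comparison for n = 2 was relaxed to pytest.approx. A quick sketch, not part of the PR, illustrating the one-ULP difference:

import math

n = 2
assert 1 - 1 / (n + 1) == 0.6666666666666667  # old formula
assert n / (n + 1) == 0.6666666666666666      # new formula, one ULP lower
assert math.isclose(1 - 1 / (n + 1), n / (n + 1))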
@@ -173,43 +174,96 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample
)


def test_next_sample_size():
    ## Next sample size should be larger than the current one by at least 4 times
    assert next_sample_size(10) == 45, (
        "passing 10 out of 10 should require 45 successful runs to be statistically significant"
    )
    assert next_sample_size(45) == 185, (
        "passing 45 out of 45 should require 185 successful runs to be statistically significant"
@pytest.mark.parametrize(
    "sample_size, expected",
    [
        (10, 45),
        (45, 185),
        (185, 745),
        (745, 2985),
        (2985, 11945),  # 1/11945=0.00008372 = 99.99%
    ],
)
def test_next_sample_size_with_1_failure(sample_size, expected):
    assert next_sample_size_with_1_failure(sample_size) == expected


def test_next_sample_size_via_loop_with_1_failure():
    assert next_sample_size_with_1_failure(29) == next_sample_size_via_loop_with_1_failure(29), (
        "calculated via loop should match"
    )
    assert next_sample_size(185) == 745
    assert next_sample_size(745) == 2985
    assert next_sample_size(29) == 121
    assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"

    assert 28 / 29 == pytest.approx(0.96, rel=0.01)
    before = analyse_measure_from_test_sample(28, 29)
    assert before.proportion == pytest.approx(0.96, rel=0.01)
    assert before.confidence_interval_prop == pytest.approx((0.91, 1.00), 0.01)

    analysis = analyse_measure_from_test_sample(120, 121)
    assert analysis.proportion == pytest.approx(0.99, rel=0.01)
    assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)
def test_next_success_after_29_runs_is_121():
    starting_runs = 29
    starting_success_rate = (starting_runs - 1) / starting_runs
    starting_analysis = analyse_measure_from_test_sample(starting_runs - 1, starting_runs)

    assert starting_analysis.proportion == pytest.approx(starting_success_rate)

def next_sample_size(current):
    next_size = next_sample_size_with_1_failure(starting_runs)
    assert next_size == 121, "should be 121"
    next_analysis = analyse_measure_from_test_sample(next_size - 1, next_size)

    assert next_analysis.proportion == pytest.approx(next_success_rate(next_size), rel=0.0001), (
        "analysis proportion should match next rate"
    )


def next_sample_size_with_1_failure(sample_size: int) -> int:
    ## How many successful runs are needed to be statistically significant improvement
    # compared to the current sample size with 100% success rate
    return 4 * current + 5
    # compared to the current sample size with 100% success rate at 90% confidence
    return 4 * sample_size + 5
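
The parametrized expectations above follow from iterating this closed form: each expected sample size is 4 times the previous one plus 5. A small sanity sketch, not part of the PR, reproducing the table:

# iterating n -> 4 * n + 5 reproduces the 1-failure expectations above
n = 10
for expected in (45, 185, 745, 2985, 11945):
    n = 4 * n + 5
    assert n == expected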


def next_sample_size_via_loop(sample_size: int) -> int:
def next_sample_size_via_loop_with_1_failure(sample_size: int) -> int:
    goal_success_rate = next_success_rate(sample_size)
    for i in range(sample_size, 5 * sample_size):
        if not is_within_expected(goal_success_rate, 1, i):
            return i
    return 0


def next_sample_size_via_loop_no_failure(sample_size: int) -> int:
    goal_success_rate = next_success_rate(sample_size)
    for i in range(sample_size, 5 * sample_size):
        if not is_within_expected(goal_success_rate, 0, i):
            return i
    return 0


def next_sample_size_no_failure(sample_size: int) -> int:
    return 2 * sample_size + 3


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 45),
(45, 185),
(185, 745),
(745, 2985),
(29, 121),
],
)
def test_next_sample_size_via_loop(sample_size, expected):
    assert next_sample_size_via_loop_with_1_failure(sample_size) == expected


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 23),
(23, 49),
(49, 101),
(101, 205),
(205, 413),
],
)
def test_next_no_failure_sample_size_via_loop(sample_size, expected):
    assert next_sample_size_via_loop_no_failure(sample_size) == expected
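
These no-failure expectations line up with the closed form next_sample_size_no_failure defined above (2 * n + 3). A brief arithmetic check, not part of the PR:

# iterating n -> 2 * n + 3 reproduces the no-failure expectations above
n = 10
for expected in (23, 49, 101, 205, 413):
    n = 2 * n + 3
    assert n == expected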


def test_success_rate():
    tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
    assert tiny_set_analysis.proportion == 0.5
@@ -257,3 +311,18 @@ def test_sort_names_with_numbers():
    assert sorted([Path(p) for p in unsorted], key=natural_sort_key) == [
        Path(p) for p in correctly_sorted
    ], "example_10_threshold should be last, while example_1_text_response should be first"


def test_example_on_wiki():
    sample_size = 47
    success_rate = 0.950
    assert not is_statistically_significant(success_rate, 1, sample_size)
    next_rate = next_success_rate(sample_size)
    next_size = next_sample_size_no_failure(sample_size)
    assert next_sample_size_via_loop_with_1_failure(sample_size) == 193
    assert next_size == 97
    assert next_rate == pytest.approx(0.98, rel=0.01)

    assert not is_within_expected(0.95, 1, next_size)
    assert not is_within_expected(next_rate, 0, next_size)
    assert is_within_expected(next_rate, 1, next_size)
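
The wiki example reduces to arithmetic on the two closed forms; a short sketch, not part of the PR, restating the asserts above outside pytest:

sample_size = 47
next_rate = sample_size / (sample_size + 1)  # 47/48, roughly 0.979
assert abs(next_rate - 0.98) < 0.01
assert 2 * sample_size + 3 == 97    # next_sample_size_no_failure: runs needed with no failures
assert 4 * sample_size + 5 == 193   # runs needed if one failure is tolerated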
108 changes: 0 additions & 108 deletions examples/team_recommender/tests/test_proportions_ztest.py

This file was deleted.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -25,7 +25,6 @@ test = [
"pytest-asyncio>=0.21.0,<0.22",
"mypy>=1.8.0,<2",
"pytest-snapshot>=0.9.0",
"statsmodels>=0.14.4",
]
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
dev = [