Merged
4 changes: 4 additions & 0 deletions examples/team_recommender/tests/helpers.py
@@ -79,6 +79,10 @@ def process_row(row: tuple[int, int, float]) -> str:
return f"{row[0]} failures out of {row[1]} is within {row[2] * 100:.0f}% success rate"


def is_statistically_significant(success_rate: float, failure_count: int, sample_size: int) -> bool:
    return not is_within_expected(success_rate, failure_count, sample_size)


def is_within_expected(success_rate: float, failure_count: int, sample_size: int) -> bool:
print(f"is_within_expected({success_rate}, {failure_count}, {sample_size})")
if sample_size <= 1:
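Note, not part of the diff: the new helper simply negates is_within_expected, so a failure count that falls outside the expected margin for a given success rate and sample size is treated as statistically significant. A minimal usage sketch, reusing the numbers from test_example_on_wiki further down and assuming the tests' import path:

# hypothetical usage sketch; helpers.py lives in examples/team_recommender/tests/
from helpers import is_statistically_significant, is_within_expected

# 1 failure in 47 runs is still compatible with an expected 95% success rate,
# so it does not count as a statistically significant deviation.
assert is_within_expected(0.95, failure_count=1, sample_size=47)
assert not is_statistically_significant(0.95, failure_count=1, sample_size=47)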
117 changes: 93 additions & 24 deletions examples/team_recommender/tests/test_helpers.py
@@ -5,6 +5,7 @@
    _assert_success_rate,
    failures_within_margin_of_error_from_expected,
    generate_examples,
    is_statistically_significant,
    is_within_expected,
    natural_sort_key,
)
@@ -142,12 +143,12 @@ def test_seventy_percent_confidence_ranges_from_fifty_to_ninety():


def next_success_rate(sample_size) -> float:
    return 1 - 1 / (sample_size + 1)
    return sample_size / (sample_size + 1)


def test_next_success_rate():
    assert next_success_rate(1) == 0.5
    assert next_success_rate(2) == 0.6666666666666667
    assert next_success_rate(2) == pytest.approx(0.6667, rel=0.01)
    assert next_success_rate(3) == 0.75
    assert next_success_rate(4) == 0.8
    assert next_success_rate(10) == 0.9090909090909091
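
For context, the two formulas are algebraically identical: n / (n + 1) = 1 - 1 / (n + 1). In floating point they can land on adjacent doubles, though, which is presumably why the exact comparison for n = 2 was relaxed to pytest.approx. A quick sketch, not part of the PR, illustrating the one-ULP difference:

import math

n = 2
assert 1 - 1 / (n + 1) == 0.6666666666666667  # old formula
assert n / (n + 1) == 0.6666666666666666      # new formula, one ULP lower
assert math.isclose(1 - 1 / (n + 1), n / (n + 1))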
@@ -173,43 +174,96 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample
)


def test_next_sample_size():
    ## Next sample size should be larger than the current one by at least 4 times
    assert next_sample_size(10) == 45, (
        "passing 10 out of 10 should require 45 successful runs to be statistically significant"
    )
    assert next_sample_size(45) == 185, (
        "passing 45 out of 45 should require 185 successful runs to be statistically significant"
@pytest.mark.parametrize(
    "sample_size, expected",
    [
        (10, 45),
        (45, 185),
        (185, 745),
        (745, 2985),
        (2985, 11945),  # 1/11945=0.00008372 = 99.99%
    ],
)
def test_next_sample_size_with_1_failure(sample_size, expected):
    assert next_sample_size_with_1_failure(sample_size) == expected


def test_next_sample_size_via_loop_with_1_failure():
    assert next_sample_size_with_1_failure(29) == next_sample_size_via_loop_with_1_failure(29), (
        "calculated via loop should match"
    )
    assert next_sample_size(185) == 745
    assert next_sample_size(745) == 2985
    assert next_sample_size(29) == 121
    assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"

    assert 28 / 29 == pytest.approx(0.96, rel=0.01)
    before = analyse_measure_from_test_sample(28, 29)
    assert before.proportion == pytest.approx(0.96, rel=0.01)
    assert before.confidence_interval_prop == pytest.approx((0.91, 1.00), 0.01)

    analysis = analyse_measure_from_test_sample(120, 121)
    assert analysis.proportion == pytest.approx(0.99, rel=0.01)
    assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)
def test_next_success_after_29_runs_is_121():
    starting_runs = 29
    starting_success_rate = (starting_runs - 1) / starting_runs
    starting_analysis = analyse_measure_from_test_sample(starting_runs - 1, starting_runs)

    assert starting_analysis.proportion == pytest.approx(starting_success_rate)

def next_sample_size(current):
    next_size = next_sample_size_with_1_failure(starting_runs)
    assert next_size == 121, "should be 121"
    next_analysis = analyse_measure_from_test_sample(next_size - 1, next_size)

    assert next_analysis.proportion == pytest.approx(next_success_rate(next_size), rel=0.0001), (
        "analysis proportion should match next rate"
    )


def next_sample_size_with_1_failure(sample_size: int) -> int:
    ## How many successful runs are needed to be statistically significant improvement
    # compared to the current sample size with 100% success rate
    return 4 * current + 5
    # compared to the current sample size with 100% success rate at 90% confidence
    return 4 * sample_size + 5
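
The parametrized expectations above follow from iterating this closed form: each expected sample size is 4 times the previous one plus 5. A small sanity sketch, not part of the PR, reproducing the table:

# iterating n -> 4 * n + 5 reproduces the 1-failure expectations above
n = 10
for expected in (45, 185, 745, 2985, 11945):
    n = 4 * n + 5
    assert n == expected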


def next_sample_size_via_loop(sample_size: int) -> int:
def next_sample_size_via_loop_with_1_failure(sample_size: int) -> int:
    goal_success_rate = next_success_rate(sample_size)
    for i in range(sample_size, 5 * sample_size):
        if not is_within_expected(goal_success_rate, 1, i):
            return i
    return 0


def next_sample_size_via_loop_no_failure(sample_size: int) -> int:
    goal_success_rate = next_success_rate(sample_size)
    for i in range(sample_size, 5 * sample_size):
        if not is_within_expected(goal_success_rate, 0, i):
            return i
    return 0


def next_sample_size_no_failure(sample_size: int) -> int:
    return 2 * sample_size + 3


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 45),
(45, 185),
(185, 745),
(745, 2985),
(29, 121),
],
)
def test_next_sample_size_via_loop(sample_size, expected):
    assert next_sample_size_via_loop_with_1_failure(sample_size) == expected


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 23),
(23, 49),
(49, 101),
(101, 205),
(205, 413),
],
)
def test_next_no_failure_sample_size_via_loop(sample_size, expected):
    assert next_sample_size_via_loop_no_failure(sample_size) == expected
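
These no-failure expectations line up with the closed form next_sample_size_no_failure defined above (2 * n + 3). A brief arithmetic check, not part of the PR:

# iterating n -> 2 * n + 3 reproduces the no-failure expectations above
n = 10
for expected in (23, 49, 101, 205, 413):
    n = 2 * n + 3
    assert n == expected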


def test_success_rate():
    tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
    assert tiny_set_analysis.proportion == 0.5
@@ -257,3 +311,18 @@ def test_sort_names_with_numbers():
    assert sorted([Path(p) for p in unsorted], key=natural_sort_key) == [
        Path(p) for p in correctly_sorted
    ], "example_10_threshold should be last, while example_1_text_response should be first"


def test_example_on_wiki():
    sample_size = 47
    success_rate = 0.950
    assert not is_statistically_significant(success_rate, 1, sample_size)
    next_rate = next_success_rate(sample_size)
    next_size = next_sample_size_no_failure(sample_size)
    assert next_sample_size_via_loop_with_1_failure(sample_size) == 193
    assert next_size == 97
    assert next_rate == pytest.approx(0.98, rel=0.01)

    assert not is_within_expected(0.95, 1, next_size)
    assert not is_within_expected(next_rate, 0, next_size)
    assert is_within_expected(next_rate, 1, next_size)
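
The wiki example reduces to arithmetic on the two closed forms; a short sketch, not part of the PR, restating the asserts above outside pytest:

sample_size = 47
next_rate = sample_size / (sample_size + 1)  # 47/48, roughly 0.979
assert abs(next_rate - 0.98) < 0.01
assert 2 * sample_size + 3 == 97    # next_sample_size_no_failure: runs needed with no failures
assert 4 * sample_size + 5 == 193   # runs needed if one failure is tolerated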
108 changes: 0 additions & 108 deletions examples/team_recommender/tests/test_proportions_ztest.py

This file was deleted.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -25,7 +25,6 @@ test = [
"pytest-asyncio>=0.21.0,<0.22",
"mypy>=1.8.0,<2",
"pytest-snapshot>=0.9.0",
"statsmodels>=0.14.4",
]
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
dev = [