116 changes: 71 additions & 45 deletions src/mcgrad/metrics.py
@@ -1155,45 +1155,6 @@ def kuiper_calibration_per_segment(
    )


def kuiper_calibration(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
    normalization_method: str | None = None,
    segments: npt.NDArray | None = None,
    precision_dtype: type[np.float16]
    | type[np.float32]
    | type[np.float64] = DEFAULT_PRECISION_DTYPE,
) -> float:
    """
    Calculates the Kuiper calibration distance between responses and scores.

    For details, see:
    Mark Tygert. (2024, January 10). Conditioning on and controlling for
    variates via cumulative differences: measuring calibration, reliability,
    biases, and other treatment effects. Zenodo.
    https://doi.org/10.5281/zenodo.10481097

    :param labels: Array of binary labels (0 or 1)
    :param predicted_scores: Array of predicted probability scores (floats between 0 and 1)
    :param sample_weight: Optional array of sample weights (non-negative floats)
    :param normalization_method: Optional method name for calculating a normalization constant.
        See kuiper_standard_deviation_per_segment or
        kuiper_upper_bound_standard_deviation_per_segment.
    :param segments: Optional array of segments used to parallelize the computation
        of the Kuiper calibration distance.
    :param precision_dtype: Data type for precision of computation. Defaults to np.float64.
    :return: Kuiper calibration distance
    """

    return kuiper_calibration_per_segment(
        labels,
        predicted_scores,
        sample_weight,
        normalization_method,
        segments,
        precision_dtype,
    ).item()


def kuiper_distribution(x: float) -> float:
"""
Evaluates the cumulative distribution function for the range
@@ -1262,12 +1223,7 @@ def kuiper_test(
    :return: A tuple containing the Kuiper statistic and the corresponding p-value.
    """

    kuiper_stat = kuiper_calibration(
        labels,
        predicted_scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    kuiper_stat = ecce_sigma(labels, predicted_scores, sample_weight)
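    # KUIPER_STATISTIC_MIN/MAX presumably bound the range over which the Kuiper
    # CDF can be evaluated reliably; outside it the p-value is pinned to its
    # limiting value (an assumption based on the surrounding code; the constants
    # are defined elsewhere in this module).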
    if kuiper_stat < KUIPER_STATISTIC_MIN:
        pval = 1.0
    elif kuiper_stat > KUIPER_STATISTIC_MAX:
Expand Down Expand Up @@ -1318,6 +1274,76 @@ def kuiper_pvalue(
)[1]


def ecce(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the Expected Cumulative Calibration Error (ECCE) [1].

    ECCE measures the range (maximum minus minimum) of the cumulative
    differences between observed labels and predicted probabilities, with
    examples ordered by predicted score. It is equivalent to the unnormalized
    Kuiper calibration statistic.

    [1]: Arrieta-Ibarra, I., Gujral, P., Tannen, J., Tygert, M., & Xu, C. (2022).
    Metrics of calibration for probabilistic predictions. Journal of Machine
    Learning Research, 23(351), 1-54. (https://tygert.com/ece.pdf)

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The ECCE value.
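
    Example (a minimal sketch; the expected value comes from this PR's unit
    tests, and the output line is indicative rather than an exact repr). In
    the unweighted case, ECCE = max_k F_k - min_k F_k, where
    F_k = (1/n) * sum_{i<=k} (y_i - p_i):

        >>> import numpy as np
        >>> ecce(np.array([0, 1, 0, 0]), np.array([1.0, 1.0, 1.0, 1.0]))  # doctest: +SKIP
        0.75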
"""
return kuiper_calibration_per_segment(
labels, predicted_scores, sample_weight, normalization_method=None
).item()


def ecce_sigma(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the ECCE normalized by its standard deviation.

    This returns the ECCE statistic divided by the standard deviation of the
    calibration error under the null hypothesis of perfect calibration, so the
    result can be read as a number of standard deviations from perfect
    calibration.

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The normalized ECCE value.
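
    Example (a minimal sketch; the expected value, accurate to about 1e-5, is
    taken from this PR's unit tests):

        >>> import numpy as np
        >>> ecce_sigma(np.array([0, 1, 0, 0]), np.array([0.6, 0.8, 0.2, 0.4]))  # doctest: +SKIP
        1.214986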
"""
return kuiper_calibration_per_segment(
labels,
predicted_scores,
sample_weight,
normalization_method="kuiper_standard_deviation",
).item()


def ecce_pvalue(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the p-value for the ECCE statistic.

    Tests the null hypothesis that predictions are perfectly calibrated using
    the Kuiper test.

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The p-value from the calibration test.
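
    Example (illustrative only; the exact p-value depends on the data, but it
    always lies in [0, 1]):

        >>> import numpy as np
        >>> rng = np.random.default_rng(0)
        >>> labels = rng.integers(0, 2, 100)
        >>> scores = rng.random(100)
        >>> 0.0 <= ecce_pvalue(labels, scores) <= 1.0  # doctest: +SKIP
        True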
"""
_, pvalue = kuiper_test(labels, predicted_scores, sample_weight)
return pvalue


def kuiper_func_per_segment(
    labels: npt.NDArray,
    predictions: npt.NDArray,
57 changes: 25 additions & 32 deletions src/mcgrad/tests/test_metrics.py
@@ -539,13 +539,11 @@ def test_fpr():
        ([1.0, 1.0, 1.0, 1.0], [0, 1, 0, 0], None, 0.75),
    ],
)
def test_kuiper_calibration_gives_expected_result(
    scores, labels, sample_weight, expected_result
):
def test_ecce_gives_expected_result(scores, labels, sample_weight, expected_result):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(labels, scores, sample_weight)
    calibration_metric = metrics.ecce(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result)
@@ -560,18 +558,13 @@ def test_kuiper_calibration_gives_expected_result(
        ([0.6, 0.8, 0.2, 0.4], [0, 1, 0, 0], [0.4, 0.3, 0.2, 0.1], 1.289317),
    ],
)
def test_kuiper_calibration_standardized_gives_expected_result(
def test_ecce_sigma_gives_expected_result(
    scores, labels, sample_weight, expected_result
):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(
        labels,
        scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    calibration_metric = metrics.ecce_sigma(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result, atol=1e-5)
@@ -588,18 +581,13 @@ def test_kuiper_calibration_standardized_gives_expected_result(
        ([1.0, 1.0, 1.0, 1.0], [0, 0, 0, 0], None, np.inf),
    ],
)
def test_kuiper_calibration_standardized_gives_expected_result_for_scores_resulting_in_zero_kuiper_stat_variance(
def test_ecce_sigma_gives_expected_result_for_scores_resulting_in_zero_variance(
    scores, labels, sample_weight, expected_result
):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(
        labels,
        scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    calibration_metric = metrics.ecce_sigma(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result)
@@ -1009,10 +997,9 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        }
    )

    global_kuiper_metric = metrics.kuiper_calibration(
    global_ecce_metric = metrics.ecce(
        labels=test_df.label.values,
        predicted_scores=test_df.prediction.values,
        normalization_method=None,
    )

    mce = metrics.MulticalibrationError(
@@ -1024,7 +1011,7 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        min_samples_per_segment=1,
        sigma_estimation_method=None,
    )
    assert np.isclose(mce.mce_absolute, global_kuiper_metric, rtol=1e-10, atol=1e-10)
    assert np.isclose(mce.mce_absolute, global_ecce_metric, rtol=1e-10, atol=1e-10)


@pytest.mark.parametrize(
@@ -1038,7 +1025,7 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        metrics.calibration_ratio,
        metrics.adaptive_calibration_error,
        metrics.expected_calibration_error,
        metrics.kuiper_calibration,
        metrics.ecce,
    ],
)
def test_wrap_sklearn_metric_func_does_not_raise_an_error_with_any_of_our_main_metrics(
@@ -1723,18 +1710,16 @@ def test_kuiper_per_segment_does_not_modify_segments_df(rng, metric_func):
        (metrics.expected_calibration_error, {}, False),
        (metrics.calibration_ratio, {}, True),
        (metrics.calibration_ratio, {}, False),
        (
            metrics.kuiper_calibration,
            {"normalization_method": "kuiper_standard_deviation"},
            True,
        ),
        (
            metrics.kuiper_calibration,
            {"normalization_method": "kuiper_standard_deviation"},
            False,
        ),
        (metrics.ecce_sigma, {}, True),
        (metrics.ecce_sigma, {}, False),
        (metrics.normalized_entropy, {}, True),
        (metrics.normalized_entropy, {}, False),
        (metrics.ecce, {}, True),
        (metrics.ecce, {}, False),
        (metrics.ecce_pvalue, {}, True),
        (metrics.ecce_pvalue, {}, False),
    ],
)
def test_metric_does_not_modify_input_arrays(
@@ -2611,3 +2596,11 @@ def test_ndcg_score_returns_nan_on_empty_arrays():
    predicted_labels = np.array([])
    result = metrics.ndcg_score(labels, predicted_labels)
    assert np.isnan(result), f"Expected NaN for empty arrays, got {result}"


def test_ecce_pvalue_consistency_with_kuiper_pvalue(rng):
    labels = rng.randint(0, 2, 100)
    predicted_scores = rng.rand(100)
    ecce_pvalue_result = metrics.ecce_pvalue(labels, predicted_scores)
    _, kuiper_pvalue_result = metrics.kuiper_test(labels, predicted_scores)
    assert ecce_pvalue_result == pytest.approx(kuiper_pvalue_result, rel=1e-10)