116 changes: 71 additions & 45 deletions src/mcgrad/metrics.py
@@ -1155,45 +1155,6 @@ def kuiper_calibration_per_segment(
    )


def kuiper_calibration(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
    normalization_method: str | None = None,
    segments: npt.NDArray | None = None,
    precision_dtype: type[np.float16]
    | type[np.float32]
    | type[np.float64] = DEFAULT_PRECISION_DTYPE,
) -> float:
    """
    Calculates the Kuiper calibration distance between responses and scores.

    For details, see:
    Mark Tygert. (2024, January 10). Conditioning on and controlling for
    variates via cumulative differences: measuring calibration, reliability,
    biases, and other treatment effects. Zenodo.
    https://doi.org/10.5281/zenodo.10481097

    :param labels: Array of binary labels (0 or 1)
    :param predicted_scores: Array of predicted probability scores (floats between 0 and 1)
    :param sample_weight: Optional array of sample weights (non-negative floats)
    :param normalization_method: Optional method name for calculating a normalization constant.
        See kuiper_standard_deviation_per_segment or
        kuiper_upper_bound_standard_deviation_per_segment.
    :param segments: Optional array of segments used to parallelize the computation
        of the Kuiper calibration distance.
    :param precision_dtype: Data type for precision of computation. Defaults to np.float64.
    :return: Kuiper calibration distance
    """

    return kuiper_calibration_per_segment(
        labels,
        predicted_scores,
        sample_weight,
        normalization_method,
        segments,
        precision_dtype,
    ).item()


def kuiper_distribution(x: float) -> float:
"""
Evaluates the cumulative distribution function for the range
@@ -1262,12 +1223,7 @@ def kuiper_test(
    :return: A tuple containing the Kuiper statistic and the corresponding p-value.
    """

    kuiper_stat = kuiper_calibration(
        labels,
        predicted_scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    kuiper_stat = ecce_sigma(labels, predicted_scores, sample_weight)
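    # KUIPER_STATISTIC_MIN/MAX presumably bound the range over which the Kuiper
    # CDF can be evaluated reliably; outside it the p-value is pinned to its
    # limiting value (an assumption based on the surrounding code; the constants
    # are defined elsewhere in this module).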
    if kuiper_stat < KUIPER_STATISTIC_MIN:
        pval = 1.0
    elif kuiper_stat > KUIPER_STATISTIC_MAX:
Expand Down Expand Up @@ -1318,6 +1274,76 @@ def kuiper_pvalue(
)[1]


def ecce(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the Expected Cumulative Calibration Error (ECCE) [1].

    ECCE measures the range (maximum minus minimum) of the cumulative
    differences between observed labels and predicted probabilities, with
    examples ordered by predicted score. It is equivalent to the unnormalized
    Kuiper calibration statistic.

    [1]: Arrieta-Ibarra, I., Gujral, P., Tannen, J., Tygert, M., & Xu, C. (2022).
    Metrics of calibration for probabilistic predictions. Journal of Machine
    Learning Research, 23(351), 1-54. (https://tygert.com/ece.pdf)

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The ECCE value.
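
    Example (a minimal sketch; the expected value comes from this PR's unit
    tests, and the output line is indicative rather than an exact repr). In
    the unweighted case, ECCE = max_k F_k - min_k F_k, where
    F_k = (1/n) * sum_{i<=k} (y_i - p_i):

        >>> import numpy as np
        >>> ecce(np.array([0, 1, 0, 0]), np.array([1.0, 1.0, 1.0, 1.0]))  # doctest: +SKIP
        0.75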
"""
return kuiper_calibration_per_segment(
labels, predicted_scores, sample_weight, normalization_method=None
).item()


def ecce_sigma(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the ECCE normalized by its standard deviation.

    This returns the ECCE statistic divided by the standard deviation of the
    calibration error under the null hypothesis of perfect calibration, so the
    result can be read as a number of standard deviations from perfect
    calibration.

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The normalized ECCE value.
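
    Example (a minimal sketch; the expected value, accurate to about 1e-5, is
    taken from this PR's unit tests):

        >>> import numpy as np
        >>> ecce_sigma(np.array([0, 1, 0, 0]), np.array([0.6, 0.8, 0.2, 0.4]))  # doctest: +SKIP
        1.214986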
"""
return kuiper_calibration_per_segment(
labels,
predicted_scores,
sample_weight,
normalization_method="kuiper_standard_deviation",
).item()


def ecce_pvalue(
    labels: npt.NDArray,
    predicted_scores: npt.NDArray,
    sample_weight: npt.NDArray | None = None,
) -> float:
    """
    Calculate the p-value for the ECCE statistic.

    Tests the null hypothesis that predictions are perfectly calibrated using
    the Kuiper test.

    :param labels: Array of true binary labels (0 or 1).
    :param predicted_scores: Array of predicted probabilities.
    :param sample_weight: Optional array of sample weights.
    :return: The p-value from the calibration test.
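
    Example (illustrative only; the exact p-value depends on the data, but it
    always lies in [0, 1]):

        >>> import numpy as np
        >>> rng = np.random.default_rng(0)
        >>> labels = rng.integers(0, 2, 100)
        >>> scores = rng.random(100)
        >>> 0.0 <= ecce_pvalue(labels, scores) <= 1.0  # doctest: +SKIP
        True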
"""
_, pvalue = kuiper_test(labels, predicted_scores, sample_weight)
return pvalue


def kuiper_func_per_segment(
    labels: npt.NDArray,
    predictions: npt.NDArray,
57 changes: 25 additions & 32 deletions src/mcgrad/tests/test_metrics.py
@@ -539,13 +539,11 @@ def test_fpr():
        ([1.0, 1.0, 1.0, 1.0], [0, 1, 0, 0], None, 0.75),
    ],
)
def test_kuiper_calibration_gives_expected_result(
    scores, labels, sample_weight, expected_result
):
def test_ecce_gives_expected_result(scores, labels, sample_weight, expected_result):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(labels, scores, sample_weight)
    calibration_metric = metrics.ecce(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result)
@@ -560,18 +558,13 @@ def test_kuiper_calibration_gives_expected_result(
        ([0.6, 0.8, 0.2, 0.4], [0, 1, 0, 0], [0.4, 0.3, 0.2, 0.1], 1.289317),
    ],
)
def test_kuiper_calibration_standardized_gives_expected_result(
def test_ecce_sigma_gives_expected_result(
    scores, labels, sample_weight, expected_result
):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(
        labels,
        scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    calibration_metric = metrics.ecce_sigma(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result, atol=1e-5)
@@ -588,18 +581,13 @@ def test_kuiper_calibration_standardized_gives_expected_result(
        ([1.0, 1.0, 1.0, 1.0], [0, 0, 0, 0], None, np.inf),
    ],
)
def test_kuiper_calibration_standardized_gives_expected_result_for_scores_resulting_in_zero_kuiper_stat_variance(
def test_ecce_sigma_gives_expected_result_for_scores_resulting_in_zero_variance(
    scores, labels, sample_weight, expected_result
):
    scores, labels = np.array(scores), np.array(labels)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    calibration_metric = metrics.kuiper_calibration(
        labels,
        scores,
        sample_weight,
        normalization_method="kuiper_standard_deviation",
    )
    calibration_metric = metrics.ecce_sigma(labels, scores, sample_weight)

    # Check that the metric is correctly calculated
    np.testing.assert_allclose(calibration_metric, expected_result)
@@ -1009,10 +997,9 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        }
    )

    global_kuiper_metric = metrics.kuiper_calibration(
    global_ecce_metric = metrics.ecce(
        labels=test_df.label.values,
        predicted_scores=test_df.prediction.values,
        normalization_method=None,
    )

    mce = metrics.MulticalibrationError(
@@ -1024,7 +1011,7 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        min_samples_per_segment=1,
        sigma_estimation_method=None,
    )
    assert np.isclose(mce.mce_absolute, global_kuiper_metric, rtol=1e-10, atol=1e-10)
    assert np.isclose(mce.mce_absolute, global_ecce_metric, rtol=1e-10, atol=1e-10)


@pytest.mark.parametrize(
@@ -1038,7 +1025,7 @@ def test_that_multicalibrationerror_is_equal_to_ecce_metric_on_single_segment():
        metrics.calibration_ratio,
        metrics.adaptive_calibration_error,
        metrics.expected_calibration_error,
        metrics.kuiper_calibration,
        metrics.ecce,
    ],
)
def test_wrap_sklearn_metric_func_does_not_raise_an_error_with_any_of_our_main_metrics(
@@ -1723,18 +1710,16 @@ def test_kuiper_per_segment_does_not_modify_segments_df(rng, metric_func):
        (metrics.expected_calibration_error, {}, False),
        (metrics.calibration_ratio, {}, True),
        (metrics.calibration_ratio, {}, False),
        (
            metrics.kuiper_calibration,
            {"normalization_method": "kuiper_standard_deviation"},
            True,
        ),
        (
            metrics.kuiper_calibration,
            {"normalization_method": "kuiper_standard_deviation"},
            False,
        ),
        (metrics.ecce_sigma, {}, True),
        (metrics.ecce_sigma, {}, False),
        (metrics.normalized_entropy, {}, True),
        (metrics.normalized_entropy, {}, False),
        (metrics.ecce, {}, True),
        (metrics.ecce, {}, False),
        (metrics.ecce_pvalue, {}, True),
        (metrics.ecce_pvalue, {}, False),
    ],
)
def test_metric_does_not_modify_input_arrays(
@@ -2611,3 +2596,11 @@ def test_ndcg_score_returns_nan_on_empty_arrays():
    predicted_labels = np.array([])
    result = metrics.ndcg_score(labels, predicted_labels)
    assert np.isnan(result), f"Expected NaN for empty arrays, got {result}"


def test_ecce_pvalue_consistency_with_kuiper_pvalue(rng):
    labels = rng.randint(0, 2, 100)
    predicted_scores = rng.rand(100)
    ecce_pvalue_result = metrics.ecce_pvalue(labels, predicted_scores)
    _, kuiper_pvalue_result = metrics.kuiper_test(labels, predicted_scores)
    assert ecce_pvalue_result == pytest.approx(kuiper_pvalue_result, rel=1e-10)