facebookincubator · TaXxER · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025
@@ -46,9 +46,8 @@ def _calibration_error(
     :param bin_error_func: Function to calculate the error between the empirically observed rate and the estimated rate
     :param adjust_unjoined: Boolean flag indicating whether the input data is "unjoined data". In unjoined data there is
                            always a row with a negative label and there will be another row with positive label if it is a positive instance.
-                           This means that for positive instances there are two rows: one with a positive and one with a negative label. This
-                           is often used in datasets from the Ads organization. On unjoined datasets we need to make an adjustment to get an
-                           unbiased estimate of calibration error
+                           This means that for positive instances there are two rows: one with a positive and one with a negative label. On
+                           unjoined datasets we need to make an adjustment to get an unbiased estimate of calibration error.
     :param sample_weight: Array of weights for each instance. If None, then all instances are considered to have weight 1
     :return: The calibration error as a float
     """
@@ -234,6 +233,8 @@ def fpr(
     sample_weight: npt.NDArray | None = None,
     **kwargs: Any,
 ) -> float:
+    if len(labels) == 0:
+        return 0.0
     cm = skmetrics.confusion_matrix(
         y_true=labels.astype(int), y_pred=predicted_labels, sample_weight=sample_weight
     )
@@ -917,8 +918,6 @@ def kuiper_calibration(
 
 def kuiper_distribution(x: float) -> float:
     """
-    Source: https://github.com/facebookresearch/ecevecce/blob/main/codes/dists.py
-
     Evaluates the cumulative distribution function for the range
     (maximum minus minimum) of the standard Brownian motion on [0, 1].
 
@@ -984,7 +983,7 @@ def kuiper_test(
     :return: A tuple containing the Kuiper statistic and the corresponding p-value.
     """
 
-    KUIPER_MAX: float = 8.26732673  # See diff D76596250
+    KUIPER_MAX: float = 8.26732673
     KUIPER_MIN: float = 1e-20
 
     kuiper_statistic = kuiper_calibration(
@@ -1202,6 +1201,7 @@ def _rank_calibration_error(
     labels: npt.NDArray,
     predicted_labels: npt.NDArray,
     num_bins: int = CALIBRATION_ERROR_NUM_BINS,
+    rng: np.random.RandomState | None = None,
 ) -> tuple[float, npt.NDArray, npt.NDArray]:
     """
     Calculates rank calibration error as proposed in: https://arxiv.org/pdf/2404.03163
@@ -1212,7 +1212,8 @@ def _rank_calibration_error(
     :return: tuple (RCE, label_cdfs, prediction_cdfs)
     """
     # break ties
-    eps = np.random.uniform(0, 1, labels.shape[0]) * CALIBRATION_ERROR_EPSILON
+    rng = np.random.RandomState(42) if rng is None else rng
+    eps = rng.uniform(0, 1, labels.shape[0]) * CALIBRATION_ERROR_EPSILON
     labels = labels + eps
     predicted_labels = predicted_labels + eps
 
@@ -1230,10 +1231,7 @@ def _rank_calibration_error(
         prediction_means[i - 1] = np.mean(sorted_predictions[low:high])
 
     label_cdfs = np.array(
-        [
-            (np.sum([label_means[i] >= label_means])) / (num_bins)
-            for i in range(num_bins)
-        ]
+        [np.sum(label_means[i] >= label_means) / num_bins for i in range(num_bins)]
     )
 
     prediction_cdfs = np.array(
@@ -1407,7 +1405,6 @@ def calibration_free_normalized_entropy(
 ) -> float:
     """
     Calculates the Calibration-Free normalized entropy.
-    Follows the logic from: https://fb.workplace.com/notes/743728179515952/.
 
     :param labels: Ground truth (correct) labels for n_samples samples.
     :param predictions: Predicted probabilities, as returned by a classifier's predict_proba method.
@@ -1460,8 +1457,6 @@ def __init__(
         """
         Calculates the multicalibration error with respect to a set of segments for a given dataset.
 
-        See this wiki for a detailed description of the metric: https://www.internalfb.com/wiki/MCBoost/Measuring_Multicalibration/Methodology_Deep_Dive
-
         :param df: A pandas DataFrame containing the data.
         :param label_column: The name of the column in `df` that contains the true labels.
         :param score_column: The name of the column in `df` that contains the predicted scores.

@@ -345,9 +345,9 @@ def test_multi_cg_gives_same_result_as_cg_per_segment(rank_discount, rng):
     df["segment_1"] = rng.choice(["A", "B"], size=len(df))
     df["segment_2"] = rng.choice(["C", "D"], size=len(df))
 
-    min_segmens_size = df.groupby(by=["segment_1", "segment_2"]).count().values.min()
-    print(f"{min_segmens_size=}")
-    k = min(k, min_segmens_size)
+    min_segments_size = df.groupby(by=["segment_1", "segment_2"]).count().values.min()
+    print(f"{min_segments_size=}")
+    k = min(k, min_segments_size)
 
     multi_cg_scores = metrics.multi_cg_score(
         labels=df["label"],
@@ -852,10 +852,6 @@ def test_multi_RCE_is_more_for_groups_with_worse_ranking_quality(rng):
 
 
 def test_normalized_entropy_gives_expected_result():
-    """
-    The multicalibration test case for the normalized entropy (NE) metric shares the input data
-    and expected values with the F6 implementation of NE: https://www.internalfb.com/diff/D30597707.
-    """
     y_pred = np.array([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2])
     y_true = np.array([0, 0, 1, 0, 1, 0, 1])
     result = metrics.normalized_entropy(y_true, y_pred)
@@ -867,7 +863,6 @@ def test_normalized_entropy_gives_expected_result():
     "y_pred,y_true,sample_weight,expected",
     [
         (
-            # NE unit test of F6, like in D30597707, with equal sample weights, which should give same result
             np.array([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2]),
             np.array([0, 0, 1, 0, 1, 0, 1]),
             np.array([2, 2, 2, 2, 2, 2, 2]),
@@ -884,11 +879,6 @@ def test_normalized_entropy_gives_expected_result():
 def test_normalized_entropy_with_sample_weights_gives_expected_result(
     y_pred, y_true, sample_weight, expected
 ):
-    """
-    The multicalibration test case for the normalized entropy (NE) metric shares the input data
-    and expected values with the F6 implementation of NE: https://www.internalfb.com/diff/D30597707.
-    """
-
     result = metrics.normalized_entropy(y_true, y_pred, sample_weight=sample_weight)
     assert result == pytest.approx(expected)
 
@@ -1234,7 +1224,7 @@ def test_mce_speedup_returns_values_equal_for_different_chunk_sizes(rng):
     assert np.equal(mce_chunk25.segment_sigmas, mce_chunk7.segment_sigmas).all()
 
 
-def mce_sorting_does_not_modify_original_df(rng):
+def test_mce_sorting_does_not_modify_original_df(rng):
     # Check that the original df remains unchanged after being passed into the MCE metric and locally sorted
     n_cat_fts = 3
     n_num_fts = 3