Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions src/multicalibration/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ def _calibration_error(
:param bin_error_func: Function to calculate the error between the empirically observed rate and the estimated rate
:param adjust_unjoined: Boolean flag indicating whether the input data is "unjoined data". In unjoined data there is
always a row with a negative label, and there is an additional row with a positive label if the instance is positive.
This means that for positive instances there are two rows: one with a positive and one with a negative label. This
is often used in datasets from the Ads organization. On unjoined datasets we need to make an adjustment to get an
unbiased estimate of calibration error
This means that for positive instances there are two rows: one with a positive and one with a negative label. On
unjoined datasets we need to make an adjustment to get an unbiased estimate of the calibration error.
:param sample_weight: Array of weights for each instance. If None, then all instances are considered to have weight 1
:return: The calibration error as a float
"""
Expand Down Expand Up @@ -234,6 +233,8 @@ def fpr(
sample_weight: npt.NDArray | None = None,
**kwargs: Any,
) -> float:
if len(labels) == 0:
return 0.0
cm = skmetrics.confusion_matrix(
y_true=labels.astype(int), y_pred=predicted_labels, sample_weight=sample_weight
)
Expand Down Expand Up @@ -917,8 +918,6 @@ def kuiper_calibration(

def kuiper_distribution(x: float) -> float:
"""
Source: https://github.com/facebookresearch/ecevecce/blob/main/codes/dists.py

Evaluates the cumulative distribution function for the range
(maximum minus minimum) of the standard Brownian motion on [0, 1].

Expand Down Expand Up @@ -984,7 +983,7 @@ def kuiper_test(
:return: A tuple containing the Kuiper statistic and the corresponding p-value.
"""

KUIPER_MAX: float = 8.26732673 # See diff D76596250
KUIPER_MAX: float = 8.26732673
KUIPER_MIN: float = 1e-20

kuiper_statistic = kuiper_calibration(
Expand Down Expand Up @@ -1202,6 +1201,7 @@ def _rank_calibration_error(
labels: npt.NDArray,
predicted_labels: npt.NDArray,
num_bins: int = CALIBRATION_ERROR_NUM_BINS,
rng: np.random.RandomState | None = None,
) -> tuple[float, npt.NDArray, npt.NDArray]:
"""
Calculates rank calibration error as proposed in: https://arxiv.org/pdf/2404.03163
Expand All @@ -1212,7 +1212,8 @@ def _rank_calibration_error(
:return: tuple (RCE, label_cdfs, prediction_cdfs)
"""
# break ties
eps = np.random.uniform(0, 1, labels.shape[0]) * CALIBRATION_ERROR_EPSILON
rng = np.random.RandomState(42) if rng is None else rng
eps = rng.uniform(0, 1, labels.shape[0]) * CALIBRATION_ERROR_EPSILON
labels = labels + eps
predicted_labels = predicted_labels + eps

Expand All @@ -1230,10 +1231,7 @@ def _rank_calibration_error(
prediction_means[i - 1] = np.mean(sorted_predictions[low:high])

label_cdfs = np.array(
[
(np.sum([label_means[i] >= label_means])) / (num_bins)
for i in range(num_bins)
]
[np.sum(label_means[i] >= label_means) / num_bins for i in range(num_bins)]
)

prediction_cdfs = np.array(
Expand Down Expand Up @@ -1407,7 +1405,6 @@ def calibration_free_normalized_entropy(
) -> float:
"""
Calculates the Calibration-Free normalized entropy.
Follows the logic from: https://fb.workplace.com/notes/743728179515952/.

:param labels: Ground truth (correct) labels for n_samples samples.
:param predictions: Predicted probabilities, as returned by a classifier's predict_proba method.
Expand Down Expand Up @@ -1460,8 +1457,6 @@ def __init__(
"""
Calculates the multicalibration error with respect to a set of segments for a given dataset.

See this wiki for a detailed description of the metric: https://www.internalfb.com/wiki/MCBoost/Measuring_Multicalibration/Methodology_Deep_Dive

:param df: A pandas DataFrame containing the data.
:param label_column: The name of the column in `df` that contains the true labels.
:param score_column: The name of the column in `df` that contains the predicted scores.
Expand Down
18 changes: 4 additions & 14 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,9 +345,9 @@ def test_multi_cg_gives_same_result_as_cg_per_segment(rank_discount, rng):
df["segment_1"] = rng.choice(["A", "B"], size=len(df))
df["segment_2"] = rng.choice(["C", "D"], size=len(df))

min_segmens_size = df.groupby(by=["segment_1", "segment_2"]).count().values.min()
print(f"{min_segmens_size=}")
k = min(k, min_segmens_size)
min_segments_size = df.groupby(by=["segment_1", "segment_2"]).count().values.min()
print(f"{min_segments_size=}")
k = min(k, min_segments_size)

multi_cg_scores = metrics.multi_cg_score(
labels=df["label"],
Expand Down Expand Up @@ -852,10 +852,6 @@ def test_multi_RCE_is_more_for_groups_with_worse_ranking_quality(rng):


def test_normalized_entropy_gives_expected_result():
"""
The multicalibration test case for the normalized entropy (NE) metric shares the input data
and expected values with the F6 implementation of NE: https://www.internalfb.com/diff/D30597707.
"""
y_pred = np.array([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2])
y_true = np.array([0, 0, 1, 0, 1, 0, 1])
result = metrics.normalized_entropy(y_true, y_pred)
Expand All @@ -867,7 +863,6 @@ def test_normalized_entropy_gives_expected_result():
"y_pred,y_true,sample_weight,expected",
[
(
# NE unit test of F6, like in D30597707, with equal sample weights, which should give same result
np.array([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2]),
np.array([0, 0, 1, 0, 1, 0, 1]),
np.array([2, 2, 2, 2, 2, 2, 2]),
Expand All @@ -884,11 +879,6 @@ def test_normalized_entropy_gives_expected_result():
def test_normalized_entropy_with_sample_weights_gives_expected_result(
y_pred, y_true, sample_weight, expected
):
"""
The multicalibration test case for the normalized entropy (NE) metric shares the input data
and expected values with the F6 implementation of NE: https://www.internalfb.com/diff/D30597707.
"""

result = metrics.normalized_entropy(y_true, y_pred, sample_weight=sample_weight)
assert result == pytest.approx(expected)

Expand Down Expand Up @@ -1234,7 +1224,7 @@ def test_mce_speedup_returns_values_equal_for_different_chunk_sizes(rng):
assert np.equal(mce_chunk25.segment_sigmas, mce_chunk7.segment_sigmas).all()


def mce_sorting_does_not_modify_original_df(rng):
def test_mce_sorting_does_not_modify_original_df(rng):
# Check that the original df remains unchanged after being passed into the MCE metric and locally sorted
n_cat_fts = 3
n_num_fts = 3
Expand Down