Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions src/multicalibration/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,32 @@ def get_segment_masks(
None,
]:
"""
Generates boolean masks for the dataframe segmented based on combinations of categorical and numerical segmentation feature values.
Generate boolean masks for dataframe segments.

Segments are based on combinations of categorical and numerical
segmentation feature values.

:param df: The dataframe to segment.
:param categorical_segment_columns: A list of column names in the dataframe that are categorical.
:param numerical_segment_columns: A list of column names in the dataframe that are numerical.
:param min_depth: The minimum depth of combinations to consider for creating segments.
:param max_depth: The maximum depth of combinations to consider for creating segments. If None segmentation will continue until all combinations are considered.
:param max_values_per_segment_feature: The maximum number of unique values (or bins for numerical columns) to retain per segment feature before collapsing others into a distinct category (or bin).
:param min_samples_per_segment: The minimum number of samples per segment to be returned. Segments with fewer samples will be discarded if return_small_segments is False.
:param categorical_segment_columns: Column names that are categorical.
:param numerical_segment_columns: Column names that are numerical.
:param min_depth: Minimum depth of combinations for creating segments.
:param max_depth: Maximum depth of combinations for creating segments.
If None, segmentation continues until all combinations are considered.
:param max_values_per_segment_feature: Maximum unique values (or bins for
numerical columns) to retain per segment feature before collapsing
others into a distinct category (or bin).
:param min_samples_per_segment: Minimum samples per segment to be returned.
Segments with fewer samples will be discarded.
:param chunk_size: The number of segments to return in each chunk.
:return: A generator that yields a tuple (chunk, n_segments_in_chunk), where chunk is an array of booleans corresponding to whether a sample belongs to the segment.
:return: A generator yielding tuples of (chunk, n_segments_in_chunk,
feature_values_df), where chunk is an array of booleans corresponding
to whether a sample belongs to the segment.

Notes:
------
- If both `categorical_segment_columns` and `numerical_segment_columns` are None, all samples of the dataframe are yielded as a single segment.
- Missing values in categorical and numerical segment columns are replaced with a predefined constant and a warning is logged.
- If both `categorical_segment_columns` and `numerical_segment_columns`
are None, all samples are yielded as a single segment.
- Missing values in segment columns are replaced with a predefined
constant and a warning is logged.
"""
if categorical_segment_columns is None and numerical_segment_columns is None:
yield (
Expand Down Expand Up @@ -209,8 +220,12 @@ def replace_missing_values(
for col in numerical_segment_columns:
df_subset[col] = df_subset[col].fillna(NA_SEGMENT_VALUE_NUMERICAL)
logger.debug(
f"Missing values found in the data. Replaced with {NA_SEGMENT_VALUE_CATEGORICAL} for categorical and {NA_SEGMENT_VALUE_NUMERICAL} for numerical data."
" Missing values are treated as an additional segment feature value and are not counted towards the specified max_values_per_segment_feature limit."
"Missing values found in the data. Replaced with %s for categorical "
"and %s for numerical data. Missing values are treated as an "
"additional segment feature value and are not counted towards the "
"specified max_values_per_segment_feature limit.",
NA_SEGMENT_VALUE_CATEGORICAL,
NA_SEGMENT_VALUE_NUMERICAL,
)
return df_subset

Expand Down
22 changes: 10 additions & 12 deletions src/multicalibration/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def unshrink(
for rec_warn in recorded_warnings:
if isinstance(rec_warn.message, LineSearchWarning):
logger.info(
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless, see D69983734 for details."
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless."
)
else:
logger.debug(rec_warn)
Expand Down Expand Up @@ -279,12 +279,14 @@ def geometric_mean(x: np.ndarray) -> float:

def make_unjoined(x: np.ndarray, y: np.ndarray) -> tuple[Any, Any]:
"""
In the Ads organization, it is common to work with a data format that is commonly referred to as
'unjoined'. This means that there is always a row with a negative label and there will be a
second row with positive label if there is a conversion. This means that we will have 2 rows
for the same impression in case that that impression resulted in a conversion.
Converts a regular dataset to 'unjoined' format. In the unjoined format, there is always
a row with a negative label and there will be a second row with a positive label added to
the dataset for the same instance if it is actually a positive instance.

This method takes a regular dataset (one row per impression) and returns an unjoined
This means that we will have two rows for the same instance in case
that instance has a positive label.

This method takes a regular dataset and returns an unjoined
version of that dataset.

:param x: array of features
Expand Down Expand Up @@ -351,14 +353,10 @@ def deserialize(cls, encoder_str) -> "OrdinalEncoderWithUnknownSupport":

def hash_categorical_feature(categorical_feature: str) -> int:
"""
This implements the categorical feature encoding scheme that @Jiayuanm implemented in Hack: D56290586
It uses the last two bytes of SHA256 for categorical features.
Hashes a categorical feature using the last two bytes of SHA256.

Sometimes we need to perform the equivalent encoding in Presto, which can be done with:
The equivalent encoding in Presto can be done with:
FROM_BASE(SUBSTR(TO_HEX(SHA256(CAST(categorical_feature AS VARBINARY))), -4), 16)

This Daiquery link shows that that generates equivalent output for all test cases of test_hash_categorical_feature in test_base.py:
https://fburl.com/daiquery/neiirvol
"""
signature = hashlib.sha256(categorical_feature.encode("utf-8")).digest().hex()
last_four_hex_chars = signature[-4:]
Expand Down
10 changes: 5 additions & 5 deletions tests/test_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_that_get_segment_masks_returns_full_data_at_depth_zero():


def test_that_get_segment_masks_works_as_expected_with_nans():
# Expected behavior is that nana values are treated as a separate segment
# Expected behavior is that NaN values are treated as a separate segment
test_df = pd.DataFrame({"segment_A": ["a", np.nan, "b", np.nan]})
generator = segmentation.get_segment_masks(
test_df,
Expand Down Expand Up @@ -103,7 +103,7 @@ def test_that_get_segment_masks_returns_whole_dataset_if_no_features_are_specifi


def test_that_get_segment_masks_works_as_expected_with_nans_in_numerical_feature():
# Expected behavior is that nana values are treated as a separate segment
# Expected behavior is that NaN values are treated as a separate segment
df = pd.DataFrame({"segment_A": [0.1, None, 0.3, None, 0.4, 0.5, 0.6]})
generator = segmentation.get_segment_masks(
df,
Expand Down Expand Up @@ -274,7 +274,7 @@ def test_that_collapse_infrequent_values_collapses_all_values_to_collapse_value_
),
],
)
def test_that_collapse_infequent_values_collapses_correctly_for_happy_path(
def test_that_collapse_infrequent_values_collapses_correctly_for_happy_path(
test_array, expected
):
results = segmentation.collapse_infrequent_values(
Expand All @@ -296,7 +296,7 @@ def test_that_collapse_numeric_values_returns_identity_for_unique_values_lt_max_
assert np.array_equal(results, test_array, equal_nan=True)


def test_that_collapse_numeric_values_returns_correct_numer_of_values():
def test_that_collapse_numeric_values_returns_correct_number_of_values():
test_array = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
results = segmentation.collapse_numeric_values(test_array, max_unique_values=3)
assert len(np.unique(results)) == 3
Expand All @@ -317,7 +317,7 @@ def test_that_collapse_numeric_values_missing_values_do_not_affect_other_rows():
)


def test_that_collapse_numeric_values_returns_correct_numer_of_values_with_max_values_1():
def test_that_collapse_numeric_values_returns_correct_number_of_values_with_max_values_1():
test_array = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
results = segmentation.collapse_numeric_values(test_array, max_unique_values=1)
assert len(np.unique(results)) == 1
Expand Down
10 changes: 3 additions & 7 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,16 +293,12 @@ def test_make_unjoined_gives_expected_result(x, y, expected_x, expected_y):
@pytest.mark.parametrize(
"categorical_feature,expected_result",
[
("INDIAN", 31255),
("SUB_SAHARAN_AFRICA", 46892),
("VIETNAMESE", 22530),
("TOKYO", 54410),
("AMSTERDAM", 42395),
("JAKARTA", 21470),
],
)
def test_hash_categorical_feature(categorical_feature, expected_result):
"""
This unit test checks for equivalence with @jiayuanm's Hack implementation in D56290586.
Reference values created in Hack kernel notebook N5246895.
"""
actual_result = utils.hash_categorical_feature(categorical_feature)
assert actual_result == expected_result

Expand Down