22 changes: 9 additions & 13 deletions src/multicalibration/utils.py
@@ -52,7 +52,7 @@ def unshrink(
for rec_warn in recorded_warnings:
if isinstance(rec_warn.message, LineSearchWarning):
logger.info(
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless, see D69983734 for details."
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless."
)
else:
logger.debug(rec_warn)
@@ -279,13 +279,13 @@ def geometric_mean(x: np.ndarray) -> float:

def make_unjoined(x: np.ndarray, y: np.ndarray) -> tuple[Any, Any]:
"""
In the Ads organization, it is common to work with a data format that is commonly referred to as
'unjoined'. This means that there is always a row with a negative label and there will be a
second row with positive label if there is a conversion. This means that we will have 2 rows
for the same impression in case that that impression resulted in a conversion.
Converts a regular dataset to 'unjoined' format. In the unjoined format, every instance
gets a row with a negative label, and a second row with a positive label is added to
the dataset for the same instance if it is actually a positive instance. This contrasts
with a regular dataset, where each instance is represented by a single row with either a
positive or negative label.

This method takes a regular dataset (one row per impression) and returns an unjoined
version of that dataset.
This method takes a regular dataset and returns an unjoined version of that dataset.

:param x: array of features
:param y: array of labels
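
For illustration, a minimal sketch of the unjoined transformation described above, assuming positives are simply appended after the negative-label copies (the actual row ordering produced by make_unjoined may differ):

import numpy as np

def make_unjoined_sketch(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # Every instance contributes one row with a negative label.
    # Positive instances contribute an additional row with a positive label.
    positives = y == 1
    x_unjoined = np.concatenate([x, x[positives]])
    y_unjoined = np.concatenate(
        [np.zeros_like(y), np.ones(int(positives.sum()), dtype=y.dtype)]
    )
    return x_unjoined, y_unjoined

# e.g. x = [[1], [2]], y = [0, 1]  ->  x_unjoined = [[1], [2], [2]], y_unjoined = [0, 0, 1]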
@@ -351,14 +351,10 @@ def deserialize(cls, encoder_str) -> "OrdinalEncoderWithUnknownSupport":

def hash_categorical_feature(categorical_feature: str) -> int:
"""
This implements the categorical feature encoding scheme that @Jiayuanm implemented in Hack: D56290586
It uses the last two bytes of SHA256 for categorical features.
Hashes a categorical feature using the last two bytes of SHA256.

Sometimes we need to perform the equivalent encoding in Presto, which can be done with:
The equivalent encoding in Presto can be done with:
FROM_BASE(SUBSTR(TO_HEX(SHA256(CAST(categorical_feature AS VARBINARY))), -4), 16)

This Daiquery link shows that that generates equivalent output for all test cases of test_hash_categorical_feature in test_base.py:
https://fburl.com/daiquery/neiirvol
"""
signature = hashlib.sha256(categorical_feature.encode("utf-8")).digest().hex()
last_four_hex_chars = signature[-4:]
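A minimal sketch of the full function, assuming the truncated body simply parses the trailing hex characters as a base-16 integer (so results fall in the range 0-65535):

import hashlib

def hash_categorical_feature_sketch(categorical_feature: str) -> int:
    # SHA256 of the UTF-8 encoded feature, as a hex string.
    signature = hashlib.sha256(categorical_feature.encode("utf-8")).digest().hex()
    # The last two bytes are the last four hex characters, read as an integer in [0, 65535].
    last_four_hex_chars = signature[-4:]
    return int(last_four_hex_chars, 16)

# This mirrors the Presto expression quoted in the docstring:
# FROM_BASE(SUBSTR(TO_HEX(SHA256(CAST(categorical_feature AS VARBINARY))), -4), 16)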
10 changes: 3 additions & 7 deletions tests/test_utils.py
@@ -293,16 +293,12 @@ def test_make_unjoined_gives_expected_result(x, y, expected_x, expected_y):
@pytest.mark.parametrize(
"categorical_feature,expected_result",
[
("INDIAN", 31255),
("SUB_SAHARAN_AFRICA", 46892),
("VIETNAMESE", 22530),
("TOKYO", 54410),
("AMSTERDAM", 42395),
("JAKARTA", 21470),
],
)
def test_hash_categorical_feature(categorical_feature, expected_result):
"""
This unit test checks for equivalence with @jiayuanm's Hack implementation in D56290586.
Reference values created in Hack kernel notebook N5246895.
"""
actual_result = utils.hash_categorical_feature(categorical_feature)
assert actual_result == expected_result

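If the reference values ever need to be reproduced outside the notebook, they can be regenerated directly from the Python implementation (assuming the module is importable as below):

from multicalibration import utils

# Print the expected hash for each test case.
for feat in ["TOKYO", "AMSTERDAM", "JAKARTA"]:
    print(feat, utils.hash_categorical_feature(feat))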