Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions src/multicalibration/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,32 @@ def get_segment_masks(
None,
]:
"""
Generates boolean masks for the dataframe segmented based on combinations of categorical and numerical segmentation feature values.
Generate boolean masks for dataframe segments.

Segments are based on combinations of categorical and numerical
segmentation feature values.

:param df: The dataframe to segment.
:param categorical_segment_columns: A list of column names in the dataframe that are categorical.
:param numerical_segment_columns: A list of column names in the dataframe that are numerical.
:param min_depth: The minimum depth of combinations to consider for creating segments.
:param max_depth: The maximum depth of combinations to consider for creating segments. If None segmentation will continue until all combinations are considered.
:param max_values_per_segment_feature: The maximum number of unique values (or bins for numerical columns) to retain per segment feature before collapsing others into a distinct category (or bin).
:param min_samples_per_segment: The minimum number of samples per segment to be returned. Segments with fewer samples will be discarded if return_small_segments is False.
:param categorical_segment_columns: Column names that are categorical.
:param numerical_segment_columns: Column names that are numerical.
:param min_depth: Minimum depth of combinations for creating segments.
:param max_depth: Maximum depth of combinations for creating segments.
If None, segmentation continues until all combinations are considered.
:param max_values_per_segment_feature: Maximum unique values (or bins for
numerical columns) to retain per segment feature before collapsing
others into a distinct category (or bin).
:param min_samples_per_segment: Minimum samples per segment to be returned.
Segments with fewer samples will be discarded.
:param chunk_size: The number of segments to return in each chunk.
:return: A generator that yields a tuple (chunk, n_segments_in_chunk), where chunk is an array of booleans corresponding to whether a sample belongs to the segment.
:return: A generator yielding tuples of (chunk, n_segments_in_chunk,
feature_values_df), where chunk is an array of booleans corresponding
to whether a sample belongs to the segment.

Notes:
------
- If both `categorical_segment_columns` and `numerical_segment_columns` are None, all samples of the dataframe are yielded as a single segment.
- Missing values in categorical and numerical segment columns are replaced with a predefined constant and a warning is logged.
- If both `categorical_segment_columns` and `numerical_segment_columns`
are None, all samples are yielded as a single segment.
- Missing values in segment columns are replaced with a predefined
constant and a warning is logged.
"""
if categorical_segment_columns is None and numerical_segment_columns is None:
yield (
Expand Down Expand Up @@ -209,8 +220,12 @@ def replace_missing_values(
for col in numerical_segment_columns:
df_subset[col] = df_subset[col].fillna(NA_SEGMENT_VALUE_NUMERICAL)
logger.debug(
f"Missing values found in the data. Replaced with {NA_SEGMENT_VALUE_CATEGORICAL} for categorical and {NA_SEGMENT_VALUE_NUMERICAL} for numerical data."
" Missing values are treated as an additional segment feature value and are not counted towards the specified max_values_per_segment_feature limit."
"Missing values found in the data. Replaced with %s for categorical "
"and %s for numerical data. Missing values are treated as an "
"additional segment feature value and are not counted towards the "
"specified max_values_per_segment_feature limit.",
NA_SEGMENT_VALUE_CATEGORICAL,
NA_SEGMENT_VALUE_NUMERICAL,
)
return df_subset

Expand Down
22 changes: 10 additions & 12 deletions src/multicalibration/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def unshrink(
for rec_warn in recorded_warnings:
if isinstance(rec_warn.message, LineSearchWarning):
logger.info(
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless, see D69983734 for details."
f"Line search warning (unshrink): {str(rec_warn.message)}. Solution is approximately optimal - no ideal step size for the gradient descent update can be found. These warnings are generally harmless."
)
else:
logger.debug(rec_warn)
Expand Down Expand Up @@ -279,12 +279,14 @@ def geometric_mean(x: np.ndarray) -> float:

def make_unjoined(x: np.ndarray, y: np.ndarray) -> tuple[Any, Any]:
"""
In the Ads organization, it is common to work with a data format that is commonly referred to as
'unjoined'. This means that there is always a row with a negative label and there will be a
second row with positive label if there is a conversion. This means that we will have 2 rows
for the same impression in case that that impression resulted in a conversion.
Converts a regular dataset to 'unjoined' format. In the unjoined format, there is always
a row with a negative label and there will be a second row with a positive label added to
the dataset for the same instance if it is actually a positive instance.

This method takes a regular dataset (one row per impression) and returns an unjoined
This means that we will have two rows for the same instance in case
that instance has a positive label.

This method takes a regular dataset and returns an unjoined
version of that dataset.

:param x: array of features
Expand Down Expand Up @@ -351,14 +353,10 @@ def deserialize(cls, encoder_str) -> "OrdinalEncoderWithUnknownSupport":

def hash_categorical_feature(categorical_feature: str) -> int:
"""
This implements the categorical feature encoding scheme that @Jiayuanm implemented in Hack: D56290586
It uses the last two bytes of SHA256 for categorical features.
Hashes a categorical feature using the last two bytes of SHA256.

Sometimes we need to perform the equivalent encoding in Presto, which can be done with:
The equivalent encoding in Presto can be done with:
FROM_BASE(SUBSTR(TO_HEX(SHA256(CAST(categorical_feature AS VARBINARY))), -4), 16)

This Daiquery link shows that that generates equivalent output for all test cases of test_hash_categorical_feature in test_base.py:
https://fburl.com/daiquery/neiirvol
"""
signature = hashlib.sha256(categorical_feature.encode("utf-8")).digest().hex()
last_four_hex_chars = signature[-4:]
Expand Down
10 changes: 5 additions & 5 deletions tests/test_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_that_get_segment_masks_returns_full_data_at_depth_zero():


def test_that_get_segment_masks_works_as_expected_with_nans():
# Expected behavior is that nana values are treated as a separate segment
# Expected behavior is that NaN values are treated as a separate segment
test_df = pd.DataFrame({"segment_A": ["a", np.nan, "b", np.nan]})
generator = segmentation.get_segment_masks(
test_df,
Expand Down Expand Up @@ -103,7 +103,7 @@ def test_that_get_segment_masks_returns_whole_dataset_if_no_features_are_specifi


def test_that_get_segment_masks_works_as_expected_with_nans_in_numerical_feature():
# Expected behavior is that nana values are treated as a separate segment
# Expected behavior is that NaN values are treated as a separate segment
df = pd.DataFrame({"segment_A": [0.1, None, 0.3, None, 0.4, 0.5, 0.6]})
generator = segmentation.get_segment_masks(
df,
Expand Down Expand Up @@ -274,7 +274,7 @@ def test_that_collapse_infrequent_values_collapses_all_values_to_collapse_value_
),
],
)
def test_that_collapse_infequent_values_collapses_correctly_for_happy_path(
def test_that_collapse_infrequent_values_collapses_correctly_for_happy_path(
test_array, expected
):
results = segmentation.collapse_infrequent_values(
Expand All @@ -296,7 +296,7 @@ def test_that_collapse_numeric_values_returns_identity_for_unique_values_lt_max_
assert np.array_equal(results, test_array, equal_nan=True)


def test_that_collapse_numeric_values_returns_correct_numer_of_values():
def test_that_collapse_numeric_values_returns_correct_number_of_values():
test_array = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
results = segmentation.collapse_numeric_values(test_array, max_unique_values=3)
assert len(np.unique(results)) == 3
Expand All @@ -317,7 +317,7 @@ def test_that_collapse_numeric_values_missing_values_do_not_affect_other_rows():
)


def test_that_collapse_numeric_values_returns_correct_numer_of_values_with_max_values_1():
def test_that_collapse_numeric_values_returns_correct_number_of_values_with_max_values_1():
test_array = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
results = segmentation.collapse_numeric_values(test_array, max_unique_values=1)
assert len(np.unique(results)) == 1
Expand Down
10 changes: 3 additions & 7 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,16 +293,12 @@ def test_make_unjoined_gives_expected_result(x, y, expected_x, expected_y):
@pytest.mark.parametrize(
"categorical_feature,expected_result",
[
("INDIAN", 31255),
("SUB_SAHARAN_AFRICA", 46892),
("VIETNAMESE", 22530),
("TOKYO", 54410),
("AMSTERDAM", 42395),
("JAKARTA", 21470),
],
)
def test_hash_categorical_feature(categorical_feature, expected_result):
"""
This unit test checks for equivalence with @jiayuanm's Hack implementation in D56290586.
Reference values created in Hack kernel notebook N5246895.
"""
actual_result = utils.hash_categorical_feature(categorical_feature)
assert actual_result == expected_result

Expand Down