feat: add ml.preprocessing.PolynomialFeatures class (#793)

GarrettWu · web-flow · commit b4fbb5187119 · 2024-06-21T13:50:55.000-07:00
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
@@ -24,6 +24,7 @@
 import bigframes_vendored.sklearn.preprocessing._discretization
 import bigframes_vendored.sklearn.preprocessing._encoder
 import bigframes_vendored.sklearn.preprocessing._label
+import bigframes_vendored.sklearn.preprocessing._polynomial
 
 from bigframes.core import log_adapter
 from bigframes.ml import base, core, globals, utils
@@ -661,6 +662,109 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
         )
 
 
+@log_adapter.class_logger
+class PolynomialFeatures(
+    base.Transformer,
+    bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures,
+):
+    __doc__ = (
+        bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__
+    )
+
+    def __init__(self, degree: int = 2):
+        self.degree = degree
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._base_sql_generator = globals.base_sql_generator()
+
+    # TODO(garrettwu): implement __hash__
+    def __eq__(self, other: Any) -> bool:
+        return (
+            type(other) is PolynomialFeatures and self._bqml_model == other._bqml_model
+        )
+
+    def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
+        """Compile this transformer to a list of SQL expressions that can be included in
+        a BQML TRANSFORM clause
+
+        Args:
+            columns:
+                a list of column names to transform.
+            X (default None):
+                Ignored.
+
+        Returns: a list of tuples of (sql_expression, output_name)"""
+        output_name = "poly_feat"
+        return [
+            (
+                self._base_sql_generator.ml_polynomial_expand(
+                    columns, self.degree, output_name
+                ),
+                output_name,
+            )
+        ]
+
+    @classmethod
+    def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]:
+        """Parse SQL to tuple(PolynomialFeatures, column_label).
+
+        Args:
+            sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)"
+
+        Returns:
+            tuple(PolynomialFeatures, column_label)"""
+        col_label = sql[sql.find("STRUCT(") + 7 : sql.find(")")]
+        degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")])
+        return cls(degree), col_label
+
+    def fit(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+    ) -> PolynomialFeatures:
+        (X,) = utils.convert_to_dataframe(X)
+
+        compiled_transforms = self._compile_to_sql(X.columns.tolist())
+        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X,
+            options={"model_type": "transform_only"},
+            transforms=transform_sqls,
+        )
+
+        # TODO(garrettwu): generalize the approach to other transformers
+        output_names = []
+        for transform_col in self._bqml_model._model._properties["transformColumns"]:
+            transform_col_dict = cast(dict, transform_col)
+            # pass the columns that are not transformed
+            if "transformSql" not in transform_col_dict:
+                continue
+            transform_sql: str = transform_col_dict["transformSql"]
+            if not transform_sql.startswith("ML."):
+                continue
+
+            output_names.append(transform_col_dict["name"])
+
+        self._output_names = output_names
+
+        return self
+
+    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("Must be fitted before transform")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        df = self._bqml_model.transform(X)
+        return typing.cast(
+            bpd.DataFrame,
+            df[self._output_names],
+        )
+
+    # TODO(garrettwu): to_gbq()
+
+
 PreprocessingType = Union[
     OneHotEncoder,
     StandardScaler,
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
@@ -73,6 +73,11 @@ def struct_options(self, **kwargs: Union[int, float]) -> str:
         """Encode a BQ STRUCT as options."""
         return f"STRUCT({self.build_structs(**kwargs)})"
 
+    def struct_columns(self, columns: Iterable[str]) -> str:
+        """Encode BQ table columns as a STRUCT."""
+        columns_str = ", ".join(columns)
+        return f"STRUCT({columns_str})"
+
     def input(self, **kwargs: str) -> str:
         """Encode a BQML INPUT clause."""
         return f"INPUT({self.build_schema(**kwargs)})"
@@ -153,6 +158,13 @@ def ml_label_encoder(
         https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params."""
         return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}"""
 
+    def ml_polynomial_expand(
+        self, columns: Iterable[str], degree: int, name: str
+    ) -> str:
+        """Encode ML.POLYNOMIAL_EXPAND.
+        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand"""
+        return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {name}"""
+
     def ml_distance(
         self,
         col_x: str,
diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py
@@ -19,6 +19,7 @@
 
 import bigframes.features
 from bigframes.ml import preprocessing
+from tests.system import utils
 
 ONE_HOT_ENCODED_DTYPE = (
     pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])))
@@ -840,3 +841,69 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
 
 
 # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
+
+
+def test_poly_features_default_params(new_penguins_df):
+    transformer = preprocessing.PolynomialFeatures()
+    df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
+    transformer.fit(df)
+
+    result = transformer.transform(df).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "poly_feat_culmen_length_mm": [
+                39.5,
+                38.5,
+                37.9,
+            ],
+            "poly_feat_culmen_length_mm_culmen_length_mm": [
+                1560.25,
+                1482.25,
+                1436.41,
+            ],
+            "poly_feat_culmen_length_mm_culmen_depth_mm": [
+                742.6,
+                662.2,
+                685.99,
+            ],
+            "poly_feat_culmen_depth_mm": [
+                18.8,
+                17.2,
+                18.1,
+            ],
+            "poly_feat_culmen_depth_mm_culmen_depth_mm": [
+                353.44,
+                295.84,
+                327.61,
+            ],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1)
+
+
+def test_poly_features_params(new_penguins_df):
+    transformer = preprocessing.PolynomialFeatures(degree=3)
+    df = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
+    transformer.fit(df)
+
+    result = transformer.transform(df).to_pandas()
+
+    utils.check_pandas_df_schema_and_index(
+        result,
+        [
+            "poly_feat_culmen_length_mm",
+            "poly_feat_culmen_length_mm_culmen_length_mm",
+            "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm",
+            "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm",
+            "poly_feat_culmen_length_mm_culmen_depth_mm",
+            "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm",
+            "poly_feat_culmen_depth_mm",
+            "poly_feat_culmen_depth_mm_culmen_depth_mm",
+            "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm",
+        ],
+        [1633, 1672, 1690],
+    )
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
@@ -145,6 +145,13 @@ def test_label_encoder_correct(
     assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a"
 
 
+def test_polynomial_expand(
+    base_sql_generator: ml_sql.BaseSqlGenerator,
+):
+    sql = base_sql_generator.ml_polynomial_expand(["col_a", "col_b"], 2, "poly_exp")
+    assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp"
+
+
 def test_distance_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
     mock_df: bpd.DataFrame,
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py
@@ -0,0 +1,38 @@
+"""
+This file contains preprocessing tools based on polynomials.
+"""
+
+from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin
+
+from bigframes import constants
+
+
+class PolynomialFeatures(TransformerMixin, BaseEstimator):
+    """Generate polynomial and interaction features."""
+
+    def fit(self, X, y=None):
+        """Compute number of output features.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series with training data.
+
+            y (default None):
+                Ignored.
+
+        Returns:
+            PolynomialFeatures: Fitted transformer.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def transform(self, X):
+        """Transform data to polynomial features.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series to be transformed.
+
+        Returns:
+            bigframes.dataframe.DataFrame: Transformed result.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)