Skip to content

Commit b4fbb51

Browse files
authored
feat: add ml.preprocessing.PolynomialFeatures class (#793)
1 parent 1b96b80 commit b4fbb51

File tree

5 files changed

+228
-0
lines changed

5 files changed

+228
-0
lines changed

bigframes/ml/preprocessing.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import bigframes_vendored.sklearn.preprocessing._discretization
2525
import bigframes_vendored.sklearn.preprocessing._encoder
2626
import bigframes_vendored.sklearn.preprocessing._label
27+
import bigframes_vendored.sklearn.preprocessing._polynomial
2728

2829
from bigframes.core import log_adapter
2930
from bigframes.ml import base, core, globals, utils
@@ -661,6 +662,109 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
661662
)
662663

663664

665+
@log_adapter.class_logger
class PolynomialFeatures(
    base.Transformer,
    bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures,
):
    __doc__ = (
        bigframes_vendored.sklearn.preprocessing._polynomial.PolynomialFeatures.__doc__
    )

    def __init__(self, degree: int = 2):
        """Create an unfitted transformer.

        Args:
            degree:
                Degree passed to ML.POLYNOMIAL_EXPAND. BigQuery ML only accepts
                values in [1, 4], so anything else is rejected up front.

        Raises:
            ValueError: if ``degree`` is outside [1, 4].
        """
        # Fail fast instead of surfacing a BigQuery error at fit() time.
        if degree not in range(1, 5):
            raise ValueError(f"degree has to be [1, 4], input is {degree}.")

        self.degree = degree
        self._bqml_model: Optional[core.BqmlModel] = None
        self._bqml_model_factory = globals.bqml_model_factory()
        self._base_sql_generator = globals.base_sql_generator()
        # Filled in by fit(); initialised here so the attribute always exists.
        self._output_names: List[str] = []

    # TODO(garrettwu): implement __hash__
    def __eq__(self, other: Any) -> bool:
        """Two transformers are equal when they share type, degree and model."""
        # `degree` must be part of the comparison: two unfitted transformers
        # both have `_bqml_model is None` and would otherwise always be equal.
        return (
            type(other) is PolynomialFeatures
            and self.degree == other.degree
            and self._bqml_model == other._bqml_model
        )

    def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]:
        """Compile this transformer to a list of SQL expressions that can be included in
        a BQML TRANSFORM clause

        Args:
            columns:
                a list of column names to transform.
            X (default None):
                Ignored.

        Returns: a list of tuples of (sql_expression, output_name)"""
        # All input columns are expanded by one expression with a fixed alias.
        output_name = "poly_feat"
        return [
            (
                self._base_sql_generator.ml_polynomial_expand(
                    columns, self.degree, output_name
                ),
                output_name,
            )
        ]

    @classmethod
    def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, str]:
        """Parse SQL to tuple(PolynomialFeatures, column_label).

        Args:
            sql: SQL string of format "ML.POLYNOMIAL_EXPAND(STRUCT(col_label0, col_label1, ...), degree)"

        Returns:
            tuple(PolynomialFeatures, column_label)"""
        # The column list is the text between "STRUCT(" and the first ")".
        col_label = sql[sql.find("STRUCT(") + len("STRUCT(") : sql.find(")")]
        # The degree is the text between the last "," and the last ")".
        degree = int(sql[sql.rfind(",") + 1 : sql.rfind(")")])
        return cls(degree), col_label

    def fit(
        self,
        X: Union[bpd.DataFrame, bpd.Series],
        y=None,  # ignored
    ) -> PolynomialFeatures:
        """Create a transform-only BQML model for X and record its output columns.

        Args:
            X: DataFrame or Series whose columns are expanded.
            y: Ignored.

        Returns:
            This transformer, now fitted.
        """
        (X,) = utils.convert_to_dataframe(X)

        compiled_transforms = self._compile_to_sql(X.columns.tolist())
        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]

        self._bqml_model = self._bqml_model_factory.create_model(
            X,
            options={"model_type": "transform_only"},
            transforms=transform_sqls,
        )

        # TODO(garrettwu): generalize the approach to other transformers
        output_names = []
        for transform_col in self._bqml_model._model._properties["transformColumns"]:
            transform_col_dict = cast(dict, transform_col)
            # pass the columns that are not transformed
            if "transformSql" not in transform_col_dict:
                continue
            transform_sql: str = transform_col_dict["transformSql"]
            # Only columns produced by an ML.* function count as outputs.
            if not transform_sql.startswith("ML."):
                continue

            output_names.append(transform_col_dict["name"])

        self._output_names = output_names

        return self

    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
        """Apply the fitted model to X and return only the expanded columns.

        Raises:
            RuntimeError: if called before the transformer has a model.
        """
        if not self._bqml_model:
            raise RuntimeError("Must be fitted before transform")

        (X,) = utils.convert_to_dataframe(X)

        df = self._bqml_model.transform(X)
        return typing.cast(
            bpd.DataFrame,
            df[self._output_names],
        )

    # TODO(garrettwu): to_gbq()
766+
767+
664768
PreprocessingType = Union[
665769
OneHotEncoder,
666770
StandardScaler,

bigframes/ml/sql.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ def struct_options(self, **kwargs: Union[int, float]) -> str:
7373
"""Encode a BQ STRUCT as options."""
7474
return f"STRUCT({self.build_structs(**kwargs)})"
7575

76+
def struct_columns(self, columns: Iterable[str]) -> str:
    """Render the given column names as a BigQuery ``STRUCT(...)`` expression.

    The names are joined with ", " exactly as given (no quoting or escaping).
    """
    return f"STRUCT({', '.join(columns)})"
80+
7681
def input(self, **kwargs: str) -> str:
    """Build a BQML ``INPUT(...)`` clause from the keyword arguments.

    The keyword arguments are forwarded unchanged to ``self.build_schema``.
    """
    schema_sql = self.build_schema(**kwargs)
    return f"INPUT({schema_sql})"
@@ -153,6 +158,13 @@ def ml_label_encoder(
153158
https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params."""
154159
return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}"""
155160

161+
def ml_polynomial_expand(
    self, columns: Iterable[str], degree: int, name: str
) -> str:
    """Build an aliased ML.POLYNOMIAL_EXPAND call over the given columns.

    The columns are wrapped by ``self.struct_columns``; ``degree`` is the
    second argument and ``name`` is the output alias.
    https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand"""
    struct_sql = self.struct_columns(columns)
    return f"ML.POLYNOMIAL_EXPAND({struct_sql}, {degree}) AS {name}"
167+
156168
def ml_distance(
157169
self,
158170
col_x: str,

tests/system/small/ml/test_preprocessing.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import bigframes.features
2121
from bigframes.ml import preprocessing
22+
from tests.system import utils
2223

2324
ONE_HOT_ENCODED_DTYPE = (
2425
pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])))
@@ -840,3 +841,69 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
840841

841842

842843
# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
844+
845+
846+
def test_poly_features_default_params(new_penguins_df):
    """Default-constructed PolynomialFeatures yields linear and quadratic terms."""
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures()
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    # One entry per expected output column, in the order BQML returns them.
    expected_columns = {
        "poly_feat_culmen_length_mm": [39.5, 38.5, 37.9],
        "poly_feat_culmen_length_mm_culmen_length_mm": [1560.25, 1482.25, 1436.41],
        "poly_feat_culmen_length_mm_culmen_depth_mm": [742.6, 662.2, 685.99],
        "poly_feat_culmen_depth_mm": [18.8, 17.2, 18.1],
        "poly_feat_culmen_depth_mm_culmen_depth_mm": [353.44, 295.84, 327.61],
    }
    tag_index = pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64")
    expected = pd.DataFrame(expected_columns, dtype="Float64", index=tag_index)

    pd.testing.assert_frame_equal(actual, expected, check_exact=False, rtol=0.1)
886+
887+
888+
def test_poly_features_params(new_penguins_df):
    """With degree=3 the output also contains every cubic term."""
    features = new_penguins_df[["culmen_length_mm", "culmen_depth_mm"]]
    poly = preprocessing.PolynomialFeatures(degree=3)
    poly.fit(features)

    actual = poly.transform(features).to_pandas()

    expected_columns = [
        "poly_feat_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_length_mm",
        "poly_feat_culmen_length_mm_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm",
        "poly_feat_culmen_length_mm_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm",
        "poly_feat_culmen_depth_mm_culmen_depth_mm_culmen_depth_mm",
    ]
    expected_index = [1633, 1672, 1690]

    utils.check_pandas_df_schema_and_index(actual, expected_columns, expected_index)

tests/unit/ml/test_sql.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,13 @@ def test_label_encoder_correct(
145145
assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a"
146146

147147

148+
def test_polynomial_expand(
    base_sql_generator: ml_sql.BaseSqlGenerator,
):
    """ml_polynomial_expand wraps the columns in a STRUCT and aliases the result."""
    expected_sql = "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp"

    actual_sql = base_sql_generator.ml_polynomial_expand(
        ["col_a", "col_b"], 2, "poly_exp"
    )

    assert actual_sql == expected_sql
153+
154+
148155
def test_distance_correct(
149156
base_sql_generator: ml_sql.BaseSqlGenerator,
150157
mock_df: bpd.DataFrame,
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
2+
This file contains preprocessing tools based on polynomials.
3+
"""
4+
5+
from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin
6+
7+
from bigframes import constants
8+
9+
10+
class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features.

    Each output column is a product of input columns, up to the configured
    degree.

    Args:
        degree (int, default 2):
            The maximum degree of the generated polynomial features.
    """

    def fit(self, X, y=None):
        """Fit the transformer to the training data.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The DataFrame or Series holding the training data.

            y (default None):
                Ignored.

        Returns:
            PolynomialFeatures: Fitted transformer.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def transform(self, X):
        """Expand the input columns into polynomial features.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                The DataFrame or Series to transform.

        Returns:
            bigframes.dataframe.DataFrame: Transformed result.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)