
Commit 392113b

feat: add ml.preprocessing.MinMaxScaler (#64)
* feat: add ml.preprocessing.MinMaxScaler
* fix comments and typo
* add test check for min value
* nit fix
1 parent e804e13 commit 392113b
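
For orientation, a minimal usage sketch of the scaler this commit adds, based on the fit_transform pattern exercised in the tests below (the public penguins table name and column choices are illustrative, not part of this change):

    import bigframes.pandas as bpd
    from bigframes.ml import preprocessing

    # Read some numeric penguin measurements (any numeric columns work).
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    X = df[["culmen_length_mm", "flipper_length_mm"]].dropna()

    # MinMaxScaler rescales each column to [0, 1] via BQML's ML.MIN_MAX_SCALER.
    scaler = preprocessing.MinMaxScaler()
    result = scaler.fit_transform(X)

    # Output columns are named min_max_scaled_<input column>.
    print(result.to_pandas().head())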

File tree

9 files changed: +302, -6 lines


bigframes/ml/compose.py

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@
     preprocessing.OneHotEncoder,
     preprocessing.StandardScaler,
     preprocessing.MaxAbsScaler,
+    preprocessing.MinMaxScaler,
     preprocessing.LabelEncoder,
 ]

bigframes/ml/pipeline.py

Lines changed: 11 additions & 1 deletion
@@ -51,6 +51,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]):
                 preprocessing.StandardScaler,
                 preprocessing.OneHotEncoder,
                 preprocessing.MaxAbsScaler,
+                preprocessing.MinMaxScaler,
                 preprocessing.LabelEncoder,
             ),
         ):
@@ -149,6 +150,7 @@ def _extract_as_column_transformer(
             preprocessing.OneHotEncoder,
             preprocessing.StandardScaler,
             preprocessing.MaxAbsScaler,
+            preprocessing.MinMaxScaler,
             preprocessing.LabelEncoder,
         ],
         Union[str, List[str]],
@@ -177,10 +179,17 @@ def _extract_as_column_transformer(
         elif transform_sql.startswith("ML.MAX_ABS_SCALER"):
             transformers.append(
                 (
-                    "max_abs_encoder",
+                    "max_abs_scaler",
                     *preprocessing.MaxAbsScaler._parse_from_sql(transform_sql),
                 )
             )
+        elif transform_sql.startswith("ML.MIN_MAX_SCALER"):
+            transformers.append(
+                (
+                    "min_max_scaler",
+                    *preprocessing.MinMaxScaler._parse_from_sql(transform_sql),
+                )
+            )
         elif transform_sql.startswith("ML.LABEL_ENCODER"):
             transformers.append(
                 (
@@ -203,6 +212,7 @@ def _merge_column_transformer(
             preprocessing.StandardScaler,
             preprocessing.OneHotEncoder,
             preprocessing.MaxAbsScaler,
+            preprocessing.MinMaxScaler,
             preprocessing.LabelEncoder,
         ]:
         """Try to merge the column transformer to a simple transformer."""

bigframes/ml/preprocessing.py

Lines changed: 82 additions & 2 deletions
@@ -144,13 +144,13 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
 
     @classmethod
     def _parse_from_sql(cls, sql: str) -> tuple[MaxAbsScaler, str]:
-        """Parse SQL to tuple(StandardScaler, column_label).
+        """Parse SQL to tuple(MaxAbsScaler, column_label).
 
         Args:
             sql: SQL string of format "ML.MAX_ABS_SCALER({col_label}) OVER()"
 
         Returns:
-            tuple(StandardScaler, column_label)"""
+            tuple(MaxAbsScaler, column_label)"""
         col_label = sql[sql.find("(") + 1 : sql.find(")")]
         return cls(), col_label
 
@@ -187,6 +187,86 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
         )
 
 
+class MinMaxScaler(
+    base.Transformer,
+    third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler,
+):
+    __doc__ = (
+        third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler.__doc__
+    )
+
+    def __init__(self):
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._base_sql_generator = globals.base_sql_generator()
+
+    # TODO(garrettwu): implement __hash__
+    def __eq__(self, other: Any) -> bool:
+        return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model
+
+    def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+        """Compile this transformer to a list of SQL expressions that can be included in
+        a BQML TRANSFORM clause
+
+        Args:
+            columns: a list of column names to transform
+
+        Returns: a list of tuples of (sql_expression, output_name)"""
+        return [
+            (
+                self._base_sql_generator.ml_min_max_scaler(
+                    column, f"min_max_scaled_{column}"
+                ),
+                f"min_max_scaled_{column}",
+            )
+            for column in columns
+        ]
+
+    @classmethod
+    def _parse_from_sql(cls, sql: str) -> tuple[MinMaxScaler, str]:
+        """Parse SQL to tuple(MinMaxScaler, column_label).
+
+        Args:
+            sql: SQL string of format "ML.MIN_MAX_SCALER({col_label}) OVER()"
+
+        Returns:
+            tuple(MinMaxScaler, column_label)"""
+        col_label = sql[sql.find("(") + 1 : sql.find(")")]
+        return cls(), col_label
+
+    def fit(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+    ) -> MinMaxScaler:
+        (X,) = utils.convert_to_dataframe(X)
+
+        compiled_transforms = self._compile_to_sql(X.columns.tolist())
+        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X,
+            options={"model_type": "transform_only"},
+            transforms=transform_sqls,
+        )
+
+        # The schema of TRANSFORM output is not available in the model API, so save it during fitting
+        self._output_names = [name for _, name in compiled_transforms]
+        return self
+
+    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("Must be fitted before transform")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        df = self._bqml_model.transform(X)
+        return typing.cast(
+            bpd.DataFrame,
+            df[self._output_names],
+        )
+
+
 class OneHotEncoder(
     base.Transformer,
     third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder,
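
To make the compile step concrete: given a list of column names, _compile_to_sql pairs each generated window expression with its output column name, and fit() then hands those expressions to the model factory as a transform-only BQML model. A rough sketch of what the compiled pairs look like (the exact strings come from the SQL generator in bigframes/ml/sql.py below; this only instantiates the class locally):

    from bigframes.ml import preprocessing

    scaler = preprocessing.MinMaxScaler()
    for sql_expr, out_name in scaler._compile_to_sql(["culmen_length_mm", "flipper_length_mm"]):
        print(out_name, "<-", sql_expr)

    # Expected shape of each pair, per ml_min_max_scaler:
    #   min_max_scaled_culmen_length_mm <- ML.MIN_MAX_SCALER(culmen_length_mm) OVER() AS min_max_scaled_culmen_length_mm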

bigframes/ml/sql.py

Lines changed: 4 additions & 0 deletions
@@ -81,6 +81,10 @@ def ml_max_abs_scaler(self, numeric_expr_sql: str, name: str) -> str:
         """Encode ML.MAX_ABS_SCALER for BQML"""
         return f"""ML.MAX_ABS_SCALER({numeric_expr_sql}) OVER() AS {name}"""
 
+    def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str:
+        """Encode ML.MIN_MAX_SCALER for BQML"""
+        return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}"""
+
     def ml_one_hot_encoder(
         self,
         numeric_expr_sql: str,
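
For reference, a hedged sketch of where this expression ends up: because fit() creates a transform-only model, the generated string becomes one entry in the TRANSFORM(...) clause of the CREATE MODEL statement issued by the model factory (the statement below is simplified and the project/dataset names are placeholders, not what bigframes actually emits verbatim):

    # Same template as the ml_min_max_scaler method above.
    expr = "ML.MIN_MAX_SCALER(culmen_length_mm) OVER() AS min_max_scaled_culmen_length_mm"

    # Roughly the statement produced for MinMaxScaler.fit() on one column.
    create_model_sql = f"""
    CREATE OR REPLACE MODEL `my_project.my_dataset.transform_only_model`
    TRANSFORM({expr})
    OPTIONS(model_type="transform_only")
    AS SELECT culmen_length_mm FROM `my_project.my_dataset.penguins`
    """
    print(create_model_sql)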

tests/system/large/ml/test_pipeline.py

Lines changed: 43 additions & 3 deletions
@@ -575,6 +575,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind
                 preprocessing.MaxAbsScaler(),
                 ["culmen_length_mm", "flipper_length_mm"],
             ),
+            (
+                "min_max_scale",
+                preprocessing.MinMaxScaler(),
+                ["culmen_length_mm", "flipper_length_mm"],
+            ),
             (
                 "label",
                 preprocessing.LabelEncoder(),
@@ -647,6 +652,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
                 preprocessing.MaxAbsScaler(),
                 ["culmen_length_mm", "flipper_length_mm"],
             ),
+            (
+                "min_max_scale",
+                preprocessing.MinMaxScaler(),
+                ["culmen_length_mm", "flipper_length_mm"],
+            ),
             (
                 "label",
                 preprocessing.LabelEncoder(),
@@ -684,9 +694,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
             "species",
         ),
         ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"),
-        ("max_abs_encoder", preprocessing.MaxAbsScaler(), "culmen_length_mm"),
+        ("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"),
+        ("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"),
         ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"),
-        ("max_abs_encoder", preprocessing.MaxAbsScaler(), "flipper_length_mm"),
+        ("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"),
+        ("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"),
     ]
 
     assert transformers == expected
@@ -743,14 +755,42 @@ def test_pipeline_max_abs_scaler_to_gbq(penguins_df_default_index, dataset_id):
     pl.fit(X_train, y_train)
 
     pl_loaded = pl.to_gbq(
-        f"{dataset_id}.test_penguins_pipeline_standard_scaler", replace=True
+        f"{dataset_id}.test_penguins_pipeline_min_max_scaler", replace=True
     )
     assert isinstance(pl_loaded._transform, preprocessing.MaxAbsScaler)
 
     assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
     assert pl_loaded._estimator.fit_intercept is False
 
 
+def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id):
+    pl = pipeline.Pipeline(
+        [
+            ("transform", preprocessing.MinMaxScaler()),
+            ("estimator", linear_model.LinearRegression(fit_intercept=False)),
+        ]
+    )
+
+    df = penguins_df_default_index.dropna()
+    X_train = df[
+        [
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+        ]
+    ]
+    y_train = df[["body_mass_g"]]
+    pl.fit(X_train, y_train)
+
+    pl_loaded = pl.to_gbq(
+        f"{dataset_id}.test_penguins_pipeline_min_max_scaler", replace=True
+    )
+    assert isinstance(pl_loaded._transform, preprocessing.MinMaxScaler)
+
+    assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
+    assert pl_loaded._estimator.fit_intercept is False
+
+
 def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id):
     pl = pipeline.Pipeline(
         [

tests/system/small/ml/test_preprocessing.py

Lines changed: 93 additions & 0 deletions
@@ -211,6 +211,99 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
     pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
 
 
+def test_min_max_scaler_normalizeds_fit_transform(new_penguins_df):
+    scaler = bigframes.ml.preprocessing.MinMaxScaler()
+    result = scaler.fit_transform(
+        new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625],
+            "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0],
+            "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
+    scaler = bigframes.ml.preprocessing.MinMaxScaler()
+    scaler.fit(penguins_df_default_index["culmen_length_mm"])
+
+    result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas()
+
+    # If minmax-scaled correctly, min should be 0 and max should be 1.
+    for column in result.columns:
+        assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3)
+        assert math.isclose(result[column].min(), 0.0, abs_tol=1e-3)
+
+    result = scaler.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
+    # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
+    scaler = bigframes.ml.preprocessing.MinMaxScaler()
+    scaler.fit(
+        penguins_df_default_index[
+            ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+        ]
+    )
+
+    result = scaler.transform(
+        penguins_df_default_index[
+            ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
+        ]
+    ).to_pandas()
+
+    # If minmax-scaled correctly, min should be 0 and max should be 1.
+    for column in result.columns:
+        assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3)
+        assert math.isclose(result[column].min(), 0.0, abs_tol=1e-3)
+
+    result = scaler.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238],
+            "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909],
+            "min_max_scaled_flipper_length_mm": [0.40678, 0.152542, 0.271186],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
 def test_one_hot_encoder_default_params(new_penguins_df):
     encoder = bigframes.ml.preprocessing.OneHotEncoder()
     encoder.fit(new_penguins_df[["species", "sex"]])
