Commit 59515e1

[wip][docs] update FrameworkPredictor examples (#38634)
Signed-off-by: Matthew Deng <matt@anyscale.com>
Signed-off-by: matthewdeng <matt@anyscale.com>
1 parent db2b8d2 commit 59515e1

File tree: 5 files changed, +43 −33 lines

doc/source/ray-air/examples/BUILD

Lines changed: 0 additions & 4 deletions
@@ -23,10 +23,6 @@ py_test_run_all_notebooks(
         "stablediffusion_batch_prediction.ipynb",  # Requires GPUs
         "gptj_deepspeed_fine_tuning.ipynb",  # Requires release test
         "dolly_lightning_fsdp_finetuning.ipynb",  # Requires release test
-        # TODO(matthewdeng): Re-enable after XGBoost and LightGBM use new train.report
-        # codepath and new checkpointing flow is enabled for the examples.
-        "lightgbm_example.ipynb",
-        "xgboost_example.ipynb",
     ],
     data = ["//doc/source/ray-air/examples:air_examples"],
     tags = ["exclusive", "team:ml", "ray_air"],

doc/source/ray-air/examples/lightgbm_example.ipynb

Lines changed: 14 additions & 13 deletions
@@ -71,13 +71,10 @@
     "from typing import Tuple\n",
     "\n",
     "import ray\n",
-    "from ray.train.lightgbm import LightGBMPredictor\n",
-    "from ray.data.preprocessors.chain import Chain\n",
-    "from ray.data.preprocessors.encoder import Categorizer\n",
+    "from ray.data import Dataset, Preprocessor\n",
+    "from ray.data.preprocessors import Categorizer, StandardScaler\n",
     "from ray.train.lightgbm import LightGBMTrainer\n",
-    "from ray.train import Result, ScalingConfig\n",
-    "from ray.data import Dataset\n",
-    "from ray.data.preprocessors import StandardScaler"
+    "from ray.train import Result, ScalingConfig"
    ]
   },
   {
@@ -124,10 +121,11 @@
     "\n",
     "    # Scale some random columns, and categorify the categorical_column,\n",
     "    # allowing LightGBM to use its built-in categorical feature support\n",
-    "    preprocessor = Chain(\n",
-    "        Categorizer([\"categorical_column\"]), \n",
-    "        StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n",
-    "    )\n",
+    "    scaler = StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n",
+    "    categorizer = Categorizer([\"categorical_column\"])\n",
+    "\n",
+    "    train_dataset = categorizer.fit_transform(scaler.fit_transform(train_dataset))\n",
+    "    valid_dataset = categorizer.transform(scaler.transform(valid_dataset))\n",
     "\n",
     "    # LightGBM specific params\n",
     "    params = {\n",
@@ -140,8 +138,8 @@
     "        label_column=\"target\",\n",
     "        params=params,\n",
     "        datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n",
-    "        preprocessor=preprocessor,\n",
     "        num_boost_round=100,\n",
+    "        metadata = {\"scaler_pkl\": scaler.serialize(), \"categorizer_pkl\": categorizer.serialize()}\n",
     "    )\n",
     "    result = trainer.fit()\n",
     "    print(result.metrics)\n",
@@ -173,10 +171,13 @@
     "class Predict:\n",
     "\n",
     "    def __init__(self, checkpoint: Checkpoint):\n",
-    "        self.predictor = LightGBMPredictor.from_checkpoint(checkpoint)\n",
+    "        self.model = LightGBMTrainer.get_model(checkpoint)\n",
+    "        self.scaler = Preprocessor.deserialize(checkpoint.get_metadata()[\"scaler_pkl\"])\n",
+    "        self.categorizer = Preprocessor.deserialize(checkpoint.get_metadata()[\"categorizer_pkl\"])\n",
     "\n",
     "    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n",
-    "        return self.predictor.predict(batch)\n",
+    "        preprocessed_batch = self.categorizer.transform_batch(self.scaler.transform_batch(batch))\n",
+    "        return {\"predictions\": self.model.predict(preprocessed_batch)}\n",
     "\n",
     "\n",
     "def predict_lightgbm(result: Result):\n",

doc/source/ray-air/examples/xgboost_example.ipynb

Lines changed: 12 additions & 7 deletions
@@ -50,7 +50,7 @@
    }
   ],
   "source": [
-    "!pip install -qU \"ray[tune]\" xgboost_ray"
+    "!pip install -qU \"ray[data,train]\" xgboost_ray"
   ]
  },
  {
@@ -76,11 +76,11 @@
     "from typing import Tuple\n",
     "\n",
     "import ray\n",
-    "from ray.train.xgboost import XGBoostPredictor\n",
+    "from ray.data import Dataset, Preprocessor\n",
+    "from ray.data.preprocessors import StandardScaler\n",
     "from ray.train.xgboost import XGBoostTrainer\n",
     "from ray.train import Result, ScalingConfig\n",
-    "from ray.data import Dataset\n",
-    "from ray.data.preprocessors import StandardScaler"
+    "import xgboost"
    ]
   },
   {
@@ -140,6 +140,8 @@
     "    # Scale some random columns\n",
     "    columns_to_scale = [\"mean radius\", \"mean texture\"]\n",
     "    preprocessor = StandardScaler(columns=columns_to_scale)\n",
+    "    train_dataset = preprocessor.fit_transform(train_dataset)\n",
+    "    valid_dataset = preprocessor.transform(valid_dataset)\n",
     "\n",
     "    # XGBoost specific params\n",
     "    params = {\n",
@@ -153,8 +155,8 @@
     "        label_column=\"target\",\n",
     "        params=params,\n",
     "        datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n",
-    "        preprocessor=preprocessor,\n",
     "        num_boost_round=100,\n",
+    "        metadata = {\"preprocessor_pkl\": preprocessor.serialize()}\n",
     "    )\n",
     "    result = trainer.fit()\n",
     "    print(result.metrics)\n",
@@ -190,10 +192,13 @@
     "class Predict:\n",
     "\n",
     "    def __init__(self, checkpoint: Checkpoint):\n",
-    "        self.predictor = XGBoostPredictor.from_checkpoint(checkpoint)\n",
+    "        self.model = XGBoostTrainer.get_model(checkpoint)\n",
+    "        self.preprocessor = Preprocessor.deserialize(checkpoint.get_metadata()[\"preprocessor_pkl\"])\n",
     "\n",
     "    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n",
-    "        return self.predictor.predict(batch)\n",
+    "        preprocessed_batch = self.preprocessor.transform_batch(batch)\n",
+    "        dmatrix = xgboost.DMatrix(preprocessed_batch)\n",
+    "        return {\"predictions\": self.model.predict(dmatrix)}\n",
     "\n",
     "\n",
     "def predict_xgboost(result: Result):\n",

python/ray/data/preprocessor.py

Lines changed: 17 additions & 0 deletions
@@ -1,5 +1,7 @@
 import abc
+import base64
 import collections
+import pickle
 import warnings
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
@@ -326,3 +328,18 @@ def preferred_batch_format(cls) -> BatchFormat:
         path is the most optimal.
         """
         return BatchFormat.PANDAS
+
+    @DeveloperAPI
+    def serialize(self) -> str:
+        """Return this preprocessor serialized as a string.
+        Note: this is not a stable serialization format as it uses `pickle`.
+        """
+        # Convert it to a plain string so that it can be included as JSON metadata
+        # in Trainer checkpoints.
+        return base64.b64encode(pickle.dumps(self)).decode("ascii")
+
+    @staticmethod
+    @DeveloperAPI
+    def deserialize(serialized: str) -> "Preprocessor":
+        """Load the original preprocessor serialized via `self.serialize()`."""
+        return pickle.loads(base64.b64decode(serialized))
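These two helpers give a string round-trip for fitted preprocessors, which is what lets the notebooks above stash them in checkpoint metadata. A quick sketch of the round-trip, with illustrative data:

    import pandas as pd
    import ray
    from ray.data.preprocessor import Preprocessor
    from ray.data.preprocessors import StandardScaler

    ds = ray.data.from_pandas(pd.DataFrame({"x": [1.0, 2.0, 3.0]}))

    scaler = StandardScaler(columns=["x"])
    scaler.fit(ds)

    # serialize() yields a plain ASCII string (base64-encoded pickle), safe to
    # embed in a checkpoint's JSON metadata.
    payload = scaler.serialize()
    assert isinstance(payload, str)

    # deserialize() restores the fitted preprocessor, statistics included.
    restored = Preprocessor.deserialize(payload)
    print(restored.transform_batch(pd.DataFrame({"x": [4.0]})))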

python/ray/data/preprocessors/chain.py

Lines changed: 0 additions & 9 deletions
@@ -1,21 +1,13 @@
-import warnings
 from typing import TYPE_CHECKING, Union
 
 from ray.air.util.data_batch_conversion import BatchFormat
 from ray.data import Dataset, DatasetPipeline
 from ray.data.preprocessor import Preprocessor
-from ray.util.annotations import Deprecated
 
 if TYPE_CHECKING:
     from ray.air.data_batch_type import DataBatchType
 
-CHAIN_DEPRECATION_MESSAGE = (
-    "The Chain preprocessor is deprecated as of Ray 2.7. Instead, manually apply your "
-    "sequence of Preprocessor `fit` and `transform` calls directly on the Ray Dataset."
-)
 
-
-@Deprecated(message=CHAIN_DEPRECATION_MESSAGE)
 class Chain(Preprocessor):
     """Combine multiple preprocessors into a single :py:class:`Preprocessor`.
 
@@ -74,7 +66,6 @@ def fit_status(self):
         return Preprocessor.FitStatus.NOT_FITTABLE
 
     def __init__(self, *preprocessors: Preprocessor):
-        warnings.warn(CHAIN_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)
         self.preprocessors = preprocessors
 
     def _fit(self, ds: Dataset) -> Preprocessor:
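The deleted deprecation message already named the replacement pattern; both forms below produce the same transformed dataset. A sketch, assuming train_dataset is a ray.data.Dataset like the one in the examples above:

    from ray.data.preprocessors import Categorizer, StandardScaler
    from ray.data.preprocessors.chain import Chain

    scaler = StandardScaler(columns=["mean radius", "mean texture"])
    categorizer = Categorizer(["categorical_column"])

    # Form 1 -- with Chain: one preprocessor object that fits and applies
    # the whole sequence.
    chained = Chain(scaler, categorizer)
    transformed = chained.fit_transform(train_dataset)

    # Form 2 -- without Chain: apply each fit/transform call directly on the
    # Dataset, which is what the updated notebook examples now do.
    transformed = categorizer.fit_transform(scaler.fit_transform(train_dataset))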
