Commit 59515e1

[wip][docs] update FrameworkPredictor examples (#38634)
Signed-off-by: Matthew Deng <matt@anyscale.com>
Signed-off-by: matthewdeng <matt@anyscale.com>
1 parent db2b8d2 commit 59515e1

File tree: 5 files changed, +43 −33 lines

doc/source/ray-air/examples/BUILD

Lines changed: 0 additions & 4 deletions
@@ -23,10 +23,6 @@ py_test_run_all_notebooks(
         "stablediffusion_batch_prediction.ipynb",  # Requires GPUs
         "gptj_deepspeed_fine_tuning.ipynb",  # Requires release test
         "dolly_lightning_fsdp_finetuning.ipynb",  # Requires release test
-        # TODO(matthewdeng): Re-enable after XGBoost and LightGBM use new train.report
-        # codepath and new checkpointing flow is enabled for the examples.
-        "lightgbm_example.ipynb",
-        "xgboost_example.ipynb",
     ],
     data = ["//doc/source/ray-air/examples:air_examples"],
     tags = ["exclusive", "team:ml", "ray_air"],

doc/source/ray-air/examples/lightgbm_example.ipynb

Lines changed: 14 additions & 13 deletions
@@ -71,13 +71,10 @@
     "from typing import Tuple\n",
     "\n",
     "import ray\n",
-    "from ray.train.lightgbm import LightGBMPredictor\n",
-    "from ray.data.preprocessors.chain import Chain\n",
-    "from ray.data.preprocessors.encoder import Categorizer\n",
+    "from ray.data import Dataset, Preprocessor\n",
+    "from ray.data.preprocessors import Categorizer, StandardScaler\n",
     "from ray.train.lightgbm import LightGBMTrainer\n",
-    "from ray.train import Result, ScalingConfig\n",
-    "from ray.data import Dataset\n",
-    "from ray.data.preprocessors import StandardScaler"
+    "from ray.train import Result, ScalingConfig"
    ]
   },
   {
@@ -124,10 +121,11 @@
     "\n",
     "    # Scale some random columns, and categorify the categorical_column,\n",
     "    # allowing LightGBM to use its built-in categorical feature support\n",
-    "    preprocessor = Chain(\n",
-    "        Categorizer([\"categorical_column\"]), \n",
-    "        StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n",
-    "    )\n",
+    "    scaler = StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n",
+    "    categorizer = Categorizer([\"categorical_column\"])\n",
+    "\n",
+    "    train_dataset = categorizer.fit_transform(scaler.fit_transform(train_dataset))\n",
+    "    valid_dataset = categorizer.transform(scaler.transform(valid_dataset))\n",
     "\n",
     "    # LightGBM specific params\n",
     "    params = {\n",
@@ -140,8 +138,8 @@
     "        label_column=\"target\",\n",
     "        params=params,\n",
     "        datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n",
-    "        preprocessor=preprocessor,\n",
     "        num_boost_round=100,\n",
+    "        metadata = {\"scaler_pkl\": scaler.serialize(), \"categorizer_pkl\": categorizer.serialize()}\n",
     "    )\n",
     "    result = trainer.fit()\n",
     "    print(result.metrics)\n",
@@ -173,10 +171,13 @@
     "class Predict:\n",
     "\n",
     "    def __init__(self, checkpoint: Checkpoint):\n",
-    "        self.predictor = LightGBMPredictor.from_checkpoint(checkpoint)\n",
+    "        self.model = LightGBMTrainer.get_model(checkpoint)\n",
+    "        self.scaler = Preprocessor.deserialize(checkpoint.get_metadata()[\"scaler_pkl\"])\n",
+    "        self.categorizer = Preprocessor.deserialize(checkpoint.get_metadata()[\"categorizer_pkl\"])\n",
     "\n",
     "    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n",
-    "        return self.predictor.predict(batch)\n",
+    "        preprocessed_batch = self.categorizer.transform_batch(self.scaler.transform_batch(batch))\n",
+    "        return {\"predictions\": self.model.predict(preprocessed_batch)}\n",
     "\n",
     "\n",
     "def predict_lightgbm(result: Result):\n",

doc/source/ray-air/examples/xgboost_example.ipynb

Lines changed: 12 additions & 7 deletions
@@ -50,7 +50,7 @@
    }
   ],
   "source": [
-    "!pip install -qU \"ray[tune]\" xgboost_ray"
+    "!pip install -qU \"ray[data,train]\" xgboost_ray"
   ]
  },
  {
@@ -76,11 +76,11 @@
     "from typing import Tuple\n",
     "\n",
     "import ray\n",
-    "from ray.train.xgboost import XGBoostPredictor\n",
+    "from ray.data import Dataset, Preprocessor\n",
+    "from ray.data.preprocessors import StandardScaler\n",
     "from ray.train.xgboost import XGBoostTrainer\n",
     "from ray.train import Result, ScalingConfig\n",
-    "from ray.data import Dataset\n",
-    "from ray.data.preprocessors import StandardScaler"
+    "import xgboost"
    ]
   },
   {
@@ -140,6 +140,8 @@
     "    # Scale some random columns\n",
     "    columns_to_scale = [\"mean radius\", \"mean texture\"]\n",
     "    preprocessor = StandardScaler(columns=columns_to_scale)\n",
+    "    train_dataset = preprocessor.fit_transform(train_dataset)\n",
+    "    valid_dataset = preprocessor.transform(valid_dataset)\n",
     "\n",
     "    # XGBoost specific params\n",
     "    params = {\n",
@@ -153,8 +155,8 @@
     "        label_column=\"target\",\n",
     "        params=params,\n",
     "        datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n",
-    "        preprocessor=preprocessor,\n",
     "        num_boost_round=100,\n",
+    "        metadata = {\"preprocessor_pkl\": preprocessor.serialize()}\n",
     "    )\n",
     "    result = trainer.fit()\n",
     "    print(result.metrics)\n",
@@ -190,10 +192,13 @@
     "class Predict:\n",
     "\n",
     "    def __init__(self, checkpoint: Checkpoint):\n",
-    "        self.predictor = XGBoostPredictor.from_checkpoint(checkpoint)\n",
+    "        self.model = XGBoostTrainer.get_model(checkpoint)\n",
+    "        self.preprocessor = Preprocessor.deserialize(checkpoint.get_metadata()[\"preprocessor_pkl\"])\n",
     "\n",
     "    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n",
-    "        return self.predictor.predict(batch)\n",
+    "        preprocessed_batch = self.preprocessor.transform_batch(batch)\n",
+    "        dmatrix = xgboost.DMatrix(preprocessed_batch)\n",
+    "        return {\"predictions\": self.model.predict(dmatrix)}\n",
     "\n",
     "\n",
     "def predict_xgboost(result: Result):\n",

python/ray/data/preprocessor.py

Lines changed: 17 additions & 0 deletions
@@ -1,5 +1,7 @@
 import abc
+import base64
 import collections
+import pickle
 import warnings
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
@@ -326,3 +328,18 @@ def preferred_batch_format(cls) -> BatchFormat:
         path is the most optimal.
         """
         return BatchFormat.PANDAS
+
+    @DeveloperAPI
+    def serialize(self) -> str:
+        """Return this preprocessor serialized as a string.
+        Note: this is not a stable serialization format as it uses `pickle`.
+        """
+        # Convert it to a plain string so that it can be included as JSON metadata
+        # in Trainer checkpoints.
+        return base64.b64encode(pickle.dumps(self)).decode("ascii")
+
+    @staticmethod
+    @DeveloperAPI
+    def deserialize(serialized: str) -> "Preprocessor":
+        """Load the original preprocessor serialized via `self.serialize()`."""
+        return pickle.loads(base64.b64decode(serialized))
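These two helpers give a string round-trip for fitted preprocessors, which is what lets the notebooks above stash them in checkpoint metadata. A quick sketch of the round-trip, with illustrative data:

    import pandas as pd
    import ray
    from ray.data.preprocessor import Preprocessor
    from ray.data.preprocessors import StandardScaler

    ds = ray.data.from_pandas(pd.DataFrame({"x": [1.0, 2.0, 3.0]}))

    scaler = StandardScaler(columns=["x"])
    scaler.fit(ds)

    # serialize() yields a plain ASCII string (base64-encoded pickle), safe to
    # embed in a checkpoint's JSON metadata.
    payload = scaler.serialize()
    assert isinstance(payload, str)

    # deserialize() restores the fitted preprocessor, statistics included.
    restored = Preprocessor.deserialize(payload)
    print(restored.transform_batch(pd.DataFrame({"x": [4.0]})))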

python/ray/data/preprocessors/chain.py

Lines changed: 0 additions & 9 deletions
@@ -1,21 +1,13 @@
-import warnings
 from typing import TYPE_CHECKING, Union
 
 from ray.air.util.data_batch_conversion import BatchFormat
 from ray.data import Dataset, DatasetPipeline
 from ray.data.preprocessor import Preprocessor
-from ray.util.annotations import Deprecated
 
 if TYPE_CHECKING:
     from ray.air.data_batch_type import DataBatchType
 
-CHAIN_DEPRECATION_MESSAGE = (
-    "The Chain preprocessor is deprecated as of Ray 2.7. Instead, manually apply your "
-    "sequence of Preprocessor `fit` and `transform` calls directly on the Ray Dataset."
-)
 
-
-@Deprecated(message=CHAIN_DEPRECATION_MESSAGE)
 class Chain(Preprocessor):
     """Combine multiple preprocessors into a single :py:class:`Preprocessor`.
 
@@ -74,7 +66,6 @@ def fit_status(self):
         return Preprocessor.FitStatus.NOT_FITTABLE
 
     def __init__(self, *preprocessors: Preprocessor):
-        warnings.warn(CHAIN_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)
         self.preprocessors = preprocessors
 
     def _fit(self, ds: Dataset) -> Preprocessor:
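The deleted deprecation message already named the replacement pattern; both forms below produce the same transformed dataset. A sketch, assuming train_dataset is a ray.data.Dataset like the one in the examples above:

    from ray.data.preprocessors import Categorizer, StandardScaler
    from ray.data.preprocessors.chain import Chain

    scaler = StandardScaler(columns=["mean radius", "mean texture"])
    categorizer = Categorizer(["categorical_column"])

    # Form 1 -- with Chain: one preprocessor object that fits and applies
    # the whole sequence.
    chained = Chain(scaler, categorizer)
    transformed = chained.fit_transform(train_dataset)

    # Form 2 -- without Chain: apply each fit/transform call directly on the
    # Dataset, which is what the updated notebook examples now do.
    transformed = categorizer.fit_transform(scaler.fit_transform(train_dataset))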
