Components - XGBoost - Train and Predict from Apache Parquet (#4035)

* Components - XGBoost - Train and Predict from Apache Parquet
* Updated the sample pipeline
kubeflow · Jun 23, 2020 · c7ef9b4 · c7ef9b4
1 parent c4340f6
commit c7ef9b4
Show file tree

Hide file tree

Showing 5 changed files with 488 additions and 11 deletions.
diff --git a/components/XGBoost/Predict/from_ApacheParquet/component.py b/components/XGBoost/Predict/from_ApacheParquet/component.py
@@ -0,0 +1,54 @@
+from kfp.components import InputPath, OutputPath, create_component_from_func
+
+def xgboost_predict(
+    data_path: InputPath('ApacheParquet'),
+    model_path: InputPath('XGBoostModel'),
+    predictions_path: OutputPath('Text'),
+    label_column_name: str = None,
+):
+    '''Make predictions using a trained XGBoost model.
+
+    Args:
+        data_path: Path for the feature data in Apache Parquet format.
+        model_path: Path for the trained model in binary XGBoost format.
+        predictions_path: Output path for the predictions.
+        label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
+
+    Annotations:
+        author: Alexey Volkov <alexey.volkov@ark-kun.com>
+    '''
+    from pathlib import Path
+
+    import numpy
+    import pandas
+    import xgboost
+
+    # Loading data
+    df = pandas.read_parquet(data_path)
+    if label_column_name:
+        df = df.drop(columns=[label_column_name])
+
+    evaluation_data = xgboost.DMatrix(
+        data=df,
+    )
+
+    # Training
+    model = xgboost.Booster(model_file=model_path)
+
+    predictions = model.predict(evaluation_data)
+
+    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
+    numpy.savetxt(predictions_path, predictions)
+
+
+if __name__ == '__main__':
+    create_component_from_func(
+        xgboost_predict,
+        output_component_file='component.yaml',
+        base_image='python:3.7',
+        packages_to_install=[
+            'xgboost==1.1.1',
+            'pandas==1.0.5',
+            'pyarrow==0.17.1',
+        ]
+    )
diff --git a/components/XGBoost/Predict/from_ApacheParquet/component.yaml b/components/XGBoost/Predict/from_ApacheParquet/component.yaml
@@ -0,0 +1,98 @@
+# NOTE(review): this spec appears to be generated by the sibling component.py
+# (create_component_from_func with output_component_file='component.yaml').
+# Prefer editing component.py and regenerating over editing this file by hand.
+name: Xgboost predict
+description: |-
+  Make predictions using a trained XGBoost model.
+
+      Args:
+          data_path: Path for the feature data in Apache Parquet format.
+          model_path: Path for the trained model in binary XGBoost format.
+          predictions_path: Output path for the predictions.
+          label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
+
+      Annotations:
+          author: Alexey Volkov <alexey.volkov@ark-kun.com>
+# "data" and "model" reach the program as file paths (inputPath in args below);
+# "label_column_name" is passed by value and only when it is supplied.
+inputs:
+- {name: data, type: ApacheParquet}
+- {name: model, type: XGBoostModel}
+- {name: label_column_name, type: String, optional: true}
+outputs:
+- {name: predictions, type: Text}
+implementation:
+  container:
+    image: python:3.7
+    # The shell step installs the pinned pip packages (falling back to a
+    # --user install if the first attempt fails) and then executes the
+    # remaining command elements via "$0" "$@".
+    command:
+    - sh
+    - -c
+    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
+      'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
+      python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
+      'pyarrow==0.17.1' --user) && "$0" "$@"
+    # Inline Python program: the xgboost_predict source followed by an argparse
+    # wrapper that maps the command-line flags onto the function's parameters.
+    - python3
+    - -u
+    - -c
+    - |
+      def _make_parent_dirs_and_return_path(file_path: str):
+          import os
+          os.makedirs(os.path.dirname(file_path), exist_ok=True)
+          return file_path
+
+      def xgboost_predict(
+          data_path,
+          model_path,
+          predictions_path,
+          label_column_name = None,
+      ):
+          '''Make predictions using a trained XGBoost model.
+
+          Args:
+              data_path: Path for the feature data in Apache Parquet format.
+              model_path: Path for the trained model in binary XGBoost format.
+              predictions_path: Output path for the predictions.
+              label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
+
+          Annotations:
+              author: Alexey Volkov <alexey.volkov@ark-kun.com>
+          '''
+          from pathlib import Path
+
+          import numpy
+          import pandas
+          import xgboost
+
+          # Loading data
+          df = pandas.read_parquet(data_path)
+          if label_column_name:
+              df = df.drop(columns=[label_column_name])
+
+          evaluation_data = xgboost.DMatrix(
+              data=df,
+          )
+
+          # Training
+          model = xgboost.Booster(model_file=model_path)
+
+          predictions = model.predict(evaluation_data)
+
+          Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
+          numpy.savetxt(predictions_path, predictions)
+
+      import argparse
+      _parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n    Args:\n        data_path: Path for the feature data in Apache Parquet format.\n        model_path: Path for the trained model in binary XGBoost format.\n        predictions_path: Output path for the predictions.\n        label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>')
+      _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--label-column-name", dest="label_column_name", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
+      _parsed_args = vars(_parser.parse_args())
+
+      _outputs = xgboost_predict(**_parsed_args)
+    # Command-line arguments handed to the inline program; the
+    # --label-column-name pair is emitted only when that input is present.
+    args:
+    - --data
+    - {inputPath: data}
+    - --model
+    - {inputPath: model}
+    - if:
+        cond: {isPresent: label_column_name}
+        then:
+        - --label-column-name
+        - {inputValue: label_column_name}
+    - --predictions
+    - {outputPath: predictions}
diff --git a/components/XGBoost/Train/from_ApacheParquet/component.py b/components/XGBoost/Train/from_ApacheParquet/component.py
@@ -0,0 +1,90 @@
+from kfp.components import InputPath, OutputPath, create_component_from_func
+
+def xgboost_train(
+    training_data_path: InputPath('ApacheParquet'),
+    model_path: OutputPath('XGBoostModel'),
+    model_config_path: OutputPath('XGBoostModelConfig'),
+    label_column_name: str,
+
+    starting_model_path: InputPath('XGBoostModel') = None,
+
+    num_iterations: int = 10,
+    booster_params: dict = None,
+
+    # Booster parameters
+    objective: str = 'reg:squarederror',
+    booster: str = 'gbtree',
+    learning_rate: float = 0.3,
+    min_split_loss: float = 0,
+    max_depth: int = 6,
+):
+    '''Train an XGBoost model.
+
+    Args:
+        training_data_path: Path for the training data in Apache Parquet format.
+        model_path: Output path for the trained model in binary XGBoost format.
+        model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
+        starting_model_path: Path for the existing trained model to start from.
+        label_column_name: Name of the column containing the label data.
+        num_boost_rounds: Number of boosting iterations.
+        booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
+        objective: The learning task and the corresponding learning objective.
+            See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
+            The most common values are:
+            "reg:squarederror" - Regression with squared loss (default).
+            "reg:logistic" - Logistic regression.
+            "binary:logistic" - Logistic regression for binary classification, output probability.
+            "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation
+            "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
+            "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized
+
+    Annotations:
+        author: Alexey Volkov <alexey.volkov@ark-kun.com>
+    '''
+    import pandas
+    import xgboost
+
+    # Loading data
+    df = pandas.read_parquet(training_data_path)
+    training_data = xgboost.DMatrix(
+        data=df.drop(columns=[label_column_name]),
+        label=df[[label_column_name]],
+    )
+    # Training
+    booster_params = booster_params or {}
+    booster_params.setdefault('objective', objective)
+    booster_params.setdefault('booster', booster)
+    booster_params.setdefault('learning_rate', learning_rate)
+    booster_params.setdefault('min_split_loss', min_split_loss)
+    booster_params.setdefault('max_depth', max_depth)
+
+    starting_model = None
+    if starting_model_path:
+        starting_model = xgboost.Booster(model_file=starting_model_path)
+
+    model = xgboost.train(
+        params=booster_params,
+        dtrain=training_data,
+        num_boost_round=num_iterations,
+        xgb_model=starting_model
+    )
+
+    # Saving the model in binary format
+    model.save_model(model_path)
+
+    model_config_str = model.save_config()
+    with open(model_config_path, 'w') as model_config_file:
+        model_config_file.write(model_config_str)
+
+
+if __name__ == '__main__':
+    create_component_from_func(
+        xgboost_train,
+        output_component_file='component.yaml',
+        base_image='python:3.7',
+        packages_to_install=[
+            'xgboost==1.1.1',
+            'pandas==1.0.5',
+            'pyarrow==0.17.1',
+        ]
+    )