Skip to content

Commit

Permalink
Components - XGBoost - Train and Predict from Apache Parquet (#4035)
Browse files Browse the repository at this point in the history
* Components - XGBoost - Train and Predict from Apache Parquet

* Updated the sample pipeline
  • Loading branch information
Ark-kun authored Jun 23, 2020
1 parent c4340f6 commit c7ef9b4
Show file tree
Hide file tree
Showing 5 changed files with 488 additions and 11 deletions.
54 changes: 54 additions & 0 deletions components/XGBoost/Predict/from_ApacheParquet/component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from kfp.components import InputPath, OutputPath, create_component_from_func

def xgboost_predict(
    data_path: InputPath('ApacheParquet'),
    model_path: InputPath('XGBoostModel'),
    predictions_path: OutputPath('Text'),
    label_column_name: str = None,
):
    '''Make predictions using a trained XGBoost model.

    Args:
        data_path: Path for the feature data in Apache Parquet format.
        model_path: Path for the trained model in binary XGBoost format.
        predictions_path: Output path for the predictions.
        label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    # Imports live inside the function because the function source is what
    # gets packaged into the component (see create_component_from_func below).
    import pathlib

    import numpy
    import pandas
    import xgboost

    # Read the feature table; a named label column is not a feature, so it
    # is removed before the matrix is built.
    features = pandas.read_parquet(data_path)
    if label_column_name:
        features = features.drop(columns=[label_column_name])
    feature_matrix = xgboost.DMatrix(data=features)

    # Load the previously trained booster and score the feature matrix.
    trained_booster = xgboost.Booster(model_file=model_path)
    scores = trained_booster.predict(feature_matrix)

    # Make sure the output directory exists, then write the predictions as
    # plain text.
    output_dir = pathlib.Path(predictions_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, scores)


if __name__ == '__main__':
    # Running this file as a script writes the component spec for
    # xgboost_predict to 'component.yaml'. The container image and the pinned
    # package versions below are recorded in that spec.
    component_options = dict(
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=[
            'xgboost==1.1.1',
            'pandas==1.0.5',
            'pyarrow==0.17.1',
        ],
    )
    create_component_from_func(xgboost_predict, **component_options)
98 changes: 98 additions & 0 deletions components/XGBoost/Predict/from_ApacheParquet/component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Component spec for the XGBoost "predict" step.
# NOTE(review): the sibling component.py passes output_component_file='component.yaml'
# to create_component_from_func, so this file appears to be generated from it;
# prefer regenerating over hand-editing - TODO confirm.
name: Xgboost predict
description: |-
Make predictions using a trained XGBoost model.
Args:
data_path: Path for the feature data in Apache Parquet format.
model_path: Path for the trained model in binary XGBoost format.
predictions_path: Output path for the predictions.
label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: data, type: ApacheParquet}
- {name: model, type: XGBoostModel}
- {name: label_column_name, type: String, optional: true}
outputs:
- {name: predictions, type: Text}
# Container implementation: the shell command installs the pinned packages with
# pip (retrying with --user if the first install fails), then runs the inlined
# Python program, which parses the command-line flags and calls xgboost_predict.
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- |
def _make_parent_dirs_and_return_path(file_path: str):
import os
os.makedirs(os.path.dirname(file_path), exist_ok=True)
return file_path
def xgboost_predict(
data_path,
model_path,
predictions_path,
label_column_name = None,
):
'''Make predictions using a trained XGBoost model.
Args:
data_path: Path for the feature data in Apache Parquet format.
model_path: Path for the trained model in binary XGBoost format.
predictions_path: Output path for the predictions.
label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pathlib import Path
import numpy
import pandas
import xgboost
# Loading data
df = pandas.read_parquet(data_path)
if label_column_name:
df = df.drop(columns=[label_column_name])
evaluation_data = xgboost.DMatrix(
data=df,
)
# Training
model = xgboost.Booster(model_file=model_path)
predictions = model.predict(evaluation_data)
Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
numpy.savetxt(predictions_path, predictions)
import argparse
_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n Args:\n data_path: Path for the feature data in Apache Parquet format.\n model_path: Path for the trained model in binary XGBoost format.\n predictions_path: Output path for the predictions.\n label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')
_parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--label-column-name", dest="label_column_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
_parsed_args = vars(_parser.parse_args())
_outputs = xgboost_predict(**_parsed_args)
args:
- --data
- {inputPath: data}
- --model
- {inputPath: model}
- if:
cond: {isPresent: label_column_name}
then:
- --label-column-name
- {inputValue: label_column_name}
- --predictions
- {outputPath: predictions}
90 changes: 90 additions & 0 deletions components/XGBoost/Train/from_ApacheParquet/component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from kfp.components import InputPath, OutputPath, create_component_from_func

def xgboost_train(
    training_data_path: InputPath('ApacheParquet'),
    model_path: OutputPath('XGBoostModel'),
    model_config_path: OutputPath('XGBoostModelConfig'),
    label_column_name: str,

    starting_model_path: InputPath('XGBoostModel') = None,

    num_iterations: int = 10,
    booster_params: dict = None,

    # Booster parameters
    objective: str = 'reg:squarederror',
    booster: str = 'gbtree',
    learning_rate: float = 0.3,
    min_split_loss: float = 0,
    max_depth: int = 6,
):
    '''Train an XGBoost model.

    Args:
        training_data_path: Path for the training data in Apache Parquet format.
        model_path: Output path for the trained model in binary XGBoost format.
        model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
        starting_model_path: Path for the existing trained model to start from.
        label_column_name: Name of the column containing the label data.
        num_iterations: Number of boosting iterations.
        booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
            Values given here take precedence over the individual booster arguments below.
        objective: The learning task and the corresponding learning objective.
            See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
            The most common values are:
            "reg:squarederror" - Regression with squared loss (default).
            "reg:logistic" - Logistic regression.
            "binary:logistic" - Logistic regression for binary classification, output probability.
            "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation
            "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
            "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized
        booster: Which booster to use, e.g. "gbtree" (default), "gblinear" or "dart".
        learning_rate: Step size shrinkage applied after each boosting step (alias: eta).
        min_split_loss: Minimum loss reduction required to make a further partition on a leaf node (alias: gamma).
        max_depth: Maximum depth of a tree.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pathlib import Path

    import pandas
    import xgboost

    # Loading data
    df = pandas.read_parquet(training_data_path)
    training_data = xgboost.DMatrix(
        data=df.drop(columns=[label_column_name]),
        label=df[[label_column_name]],
    )

    # Training
    # Work on a copy so that the setdefault calls below never mutate the
    # dictionary object passed in by the caller.
    booster_params = dict(booster_params or {})
    booster_params.setdefault('objective', objective)
    booster_params.setdefault('booster', booster)
    booster_params.setdefault('learning_rate', learning_rate)
    booster_params.setdefault('min_split_loss', min_split_loss)
    booster_params.setdefault('max_depth', max_depth)

    # Optionally continue training from an existing model.
    starting_model = None
    if starting_model_path:
        starting_model = xgboost.Booster(model_file=starting_model_path)

    model = xgboost.train(
        params=booster_params,
        dtrain=training_data,
        num_boost_round=num_iterations,
        xgb_model=starting_model,
    )

    # Create the output directories ourselves (as xgboost_predict does for its
    # output) so the function also works when called directly.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    Path(model_config_path).parent.mkdir(parents=True, exist_ok=True)

    # Saving the model in binary format
    model.save_model(model_path)

    # Saving the booster's internal configuration (a JSON string)
    model_config_str = model.save_config()
    with open(model_config_path, 'w') as model_config_file:
        model_config_file.write(model_config_str)


if __name__ == '__main__':
    # Running this file as a script writes the component spec for
    # xgboost_train to 'component.yaml'. The container image and the pinned
    # package versions below are recorded in that spec.
    component_options = dict(
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=[
            'xgboost==1.1.1',
            'pandas==1.0.5',
            'pyarrow==0.17.1',
        ],
    )
    create_component_from_func(xgboost_train, **component_options)
Loading

0 comments on commit c7ef9b4

Please sign in to comment.