Skip to content

Commit

Permalink
Components - CatBoost (#3844)
Browse files Browse the repository at this point in the history
* Components - CatBoost

* Train classifier
* Predict class

* Added CatBoost converter components

* Added a sample pipeline

* Fixed bug

* Fixed the style

* Updated the converter components

* Added the Train_regression component

* Added the Predict_values component

* Added the Predict_class_probabilities component

* Added the additional_training_options input to the trainers

* Changed the output type of Train_classifier to CatBoostModel

* Renamed Predict_class to Predict_classes

* Updated the sample pipeline

* Moved some components to the from_CSV subdirectories

* Updated the sample pipeline

* Fixed the sample

* Sample - Simplified the training data referencing

* Sample - Added training_data_for_classification calculation

* Updated a stale component.yaml file

* Fixed the input type

* Fixed the default loss function for the regression training

* Fixed the CSV reading in the predictor components

* Fixed the prediction_type

* Fixed the class predictions saving

* Update sample_pipeline.py

* Finalized the sample
  • Loading branch information
Ark-kun committed Jun 17, 2020
1 parent 4f5a7f0 commit c809fc8
Show file tree
Hide file tree
Showing 15 changed files with 1,393 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_class_probabilities(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data. Index 0 is a valid
            value; leave unset (None) when the data has no label column.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    # Compare against None instead of relying on truthiness: a label stored in
    # the first column has index 0, which is falsy and used to be ignored.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # delete=False keeps the file on disk after the handle is closed so
        # that Pool can read it by path.
        with tempfile.NamedTemporaryFile('w', delete=False) as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
        column_description_path = column_description_file.name
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    # Running this file as a script builds the component from the function
    # above; 'component.yaml' is a relative path, so the spec is presumably
    # written into the current working directory.
    component_build_options = dict(
        base_image='python:3.7',
        packages_to_install=['catboost==0.23'],
        output_component_file='component.yaml',
    )
    catboost_predict_class_probabilities_op = create_component_from_func(
        catboost_predict_class_probabilities,
        **component_build_options
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
name: Catboost predict class probabilities
description: |-
Predict class probabilities with a CatBoost model.
Args:
data_path: Path for the data in CSV format.
model_path: Path for the trained model in binary CatBoostModel format.
label_column: Column containing the label data.
predictions_path: Output path for the predictions.
Outputs:
predictions: Predictions in text format.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: data, type: CSV}
- {name: model, type: CatBoostModel}
- {name: label_column, type: Integer, optional: true}
outputs:
- {name: predictions}
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
--no-warn-script-location 'catboost==0.23' --user) && "$0" "$@"
- python3
- -u
- -c
- |
def _make_parent_dirs_and_return_path(file_path: str):
    '''Create the parent directory of `file_path` if needed and return `file_path` unchanged.

    Used below as the argparse `type=` converter for the output path, so the
    directory exists before the component writes its output.
    '''
    import os
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty directory part, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return file_path
def catboost_predict_class_probabilities(
    data_path,
    model_path,
    predictions_path,
    label_column = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data. Index 0 is a valid
            value; leave unset (None) when the data has no label column.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import tempfile
    from catboost import CatBoost, Pool
    import numpy
    # Compare against None instead of relying on truthiness: a label stored in
    # the first column has index 0, which is falsy and used to be ignored.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # delete=False keeps the file on disk after the handle is closed so
        # that Pool can read it by path.
        with tempfile.NamedTemporaryFile('w', delete=False) as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
        column_description_path = column_description_file.name
    else:
        column_description_path = None
    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )
    model = CatBoost()
    model.load_model(model_path)
    predictions = model.predict(eval_data, prediction_type='Probability')
    numpy.savetxt(predictions_path, predictions)
import argparse

# Command-line entry point of the component: every flag is parsed straight
# into the keyword argument named by `dest`. default=argparse.SUPPRESS leaves
# an omitted optional flag out of the namespace, so the function's own default
# applies.
_parser = argparse.ArgumentParser(
    prog='Catboost predict class probabilities',
    description='Predict class probabilities with a CatBoost model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Predictions in text format.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>',
)
for _flag, _dest, _converter, _is_required in (
    ("--data", "data_path", str, True),
    ("--model", "model_path", str, True),
    ("--label-column", "label_column", int, False),
    ("--predictions", "predictions_path", _make_parent_dirs_and_return_path, True),
):
    _parser.add_argument(
        _flag,
        dest=_dest,
        type=_converter,
        required=_is_required,
        default=argparse.SUPPRESS,
    )
_parsed_args = vars(_parser.parse_args())
_outputs = catboost_predict_class_probabilities(**_parsed_args)
args:
- --data
- {inputPath: data}
- --model
- {inputPath: model}
- if:
cond: {isPresent: label_column}
then:
- --label-column
- {inputValue: label_column}
- --predictions
- {outputPath: predictions}
58 changes: 58 additions & 0 deletions components/CatBoost/Predict_classes/from_CSV/component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_classes(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data. Index 0 is a valid
            value; leave unset (None) when the data has no label column.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import tempfile

    from catboost import CatBoostClassifier, Pool
    import numpy

    # Compare against None instead of relying on truthiness: a label stored in
    # the first column has index 0, which is falsy and used to be ignored.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # delete=False keeps the file on disk after the handle is closed so
        # that Pool can read it by path.
        with tempfile.NamedTemporaryFile('w', delete=False) as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
        column_description_path = column_description_file.name
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostClassifier()
    model.load_model(model_path)

    predictions = model.predict(eval_data)
    # fmt='%s' writes each prediction via str() instead of savetxt's default
    # float format.
    numpy.savetxt(predictions_path, predictions, fmt='%s')


if __name__ == '__main__':
    # Running this file as a script builds the component from the function
    # above; 'component.yaml' is a relative path, so the spec is presumably
    # written into the current working directory.
    # NOTE(review): this component pins catboost 0.22 while the sibling
    # predictor components pin 0.23 - confirm whether that is intentional.
    component_build_options = dict(
        base_image='python:3.7',
        packages_to_install=['catboost==0.22'],
        output_component_file='component.yaml',
    )
    catboost_predict_classes_op = create_component_from_func(
        catboost_predict_classes,
        **component_build_options
    )
108 changes: 108 additions & 0 deletions components/CatBoost/Predict_classes/from_CSV/component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
name: Catboost predict classes
description: |-
Predict classes using the CatBoost classifier model.
Args:
data_path: Path for the data in CSV format.
model_path: Path for the trained model in binary CatBoostModel format.
label_column: Column containing the label data.
predictions_path: Output path for the predictions.
Outputs:
predictions: Class predictions in text format.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: data, type: CSV}
- {name: model, type: CatBoostModel}
- {name: label_column, type: Integer, optional: true}
outputs:
- {name: predictions}
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
--no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
- python3
- -u
- -c
- |
def _make_parent_dirs_and_return_path(file_path: str):
    '''Create the parent directory of `file_path` if needed and return `file_path` unchanged.

    Used below as the argparse `type=` converter for the output path, so the
    directory exists before the component writes its output.
    '''
    import os
    directory = os.path.dirname(file_path)
    # os.path.dirname() yields '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError; in that case there is nothing to create.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return file_path
def catboost_predict_classes(
    data_path,
    model_path,
    predictions_path,
    label_column = None,
):
    '''Predict classes using the CatBoost classifier model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data. Index 0 is a valid
            value; leave unset (None) when the data has no label column.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Class predictions in text format.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import tempfile
    from catboost import CatBoostClassifier, Pool
    import numpy
    # Compare against None instead of relying on truthiness: a label stored in
    # the first column has index 0, which is falsy and used to be ignored.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # delete=False keeps the file on disk after the handle is closed so
        # that Pool can read it by path.
        with tempfile.NamedTemporaryFile('w', delete=False) as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
        column_description_path = column_description_file.name
    else:
        column_description_path = None
    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )
    model = CatBoostClassifier()
    model.load_model(model_path)
    predictions = model.predict(eval_data)
    # fmt='%s' writes each prediction via str() instead of savetxt's default
    # float format.
    numpy.savetxt(predictions_path, predictions, fmt='%s')
import argparse

# Command-line entry point of the component: every flag is parsed straight
# into the keyword argument named by `dest`. default=argparse.SUPPRESS leaves
# an omitted optional flag out of the namespace, so the function's own default
# applies.
_parser = argparse.ArgumentParser(
    prog='Catboost predict classes',
    description='Predict classes using the CatBoost classifier model.\n\n Args:\n data_path: Path for the data in CSV format.\n model_path: Path for the trained model in binary CatBoostModel format.\n label_column: Column containing the label data.\n predictions_path: Output path for the predictions.\n\n Outputs:\n predictions: Class predictions in text format.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>',
)
for _flag, _dest, _converter, _is_required in (
    ("--data", "data_path", str, True),
    ("--model", "model_path", str, True),
    ("--label-column", "label_column", int, False),
    ("--predictions", "predictions_path", _make_parent_dirs_and_return_path, True),
):
    _parser.add_argument(
        _flag,
        dest=_dest,
        type=_converter,
        required=_is_required,
        default=argparse.SUPPRESS,
    )
_parsed_args = vars(_parser.parse_args())
_outputs = catboost_predict_classes(**_parsed_args)
args:
- --data
- {inputPath: data}
- --model
- {inputPath: model}
- if:
cond: {isPresent: label_column}
then:
- --label-column
- {inputValue: label_column}
- --predictions
- {outputPath: predictions}
58 changes: 58 additions & 0 deletions components/CatBoost/Predict_values/from_CSV/component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from kfp.components import InputPath, OutputPath, create_component_from_func

def catboost_predict_values(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),

    label_column: int = None,
):
    '''Predict values with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data. Index 0 is a valid
            value; leave unset (None) when the data has no label column.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    # Compare against None instead of relying on truthiness: a label stored in
    # the first column has index 0, which is falsy and used to be ignored.
    if label_column is not None:
        column_descriptions = {label_column: 'Label'}
        # delete=False keeps the file on disk after the handle is closed so
        # that Pool can read it by path.
        with tempfile.NamedTemporaryFile('w', delete=False) as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
        column_description_path = column_description_file.name
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='RawFormulaVal')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    # Running this file as a script builds the component from the function
    # above; 'component.yaml' is a relative path, so the spec is presumably
    # written into the current working directory.
    component_build_options = dict(
        base_image='python:3.7',
        packages_to_install=['catboost==0.23'],
        output_component_file='component.yaml',
    )
    catboost_predict_values_op = create_component_from_func(
        catboost_predict_values,
        **component_build_options
    )
Loading

0 comments on commit c809fc8

Please sign in to comment.