Components - CatBoost (#3844)

* Components - CatBoost * Train classifier * Predict class * Added CatBoost converter components * Added a sample pipeline * Fixed bug * Fixed the style * Updated the converter components * Added the Train_regression component * Added the Predict_values component * Added the Predict_class_probabilities component * Added the additional_training_options input to the trainers * Changed the output type of Train_classifier to CatBoostModel * Renamed Predict_class to Predict_classes * Updated the sample pipeline * Moved some components to the from_CSV subdirectories * Updated the sample pipeline * Fixed the sample * Sample - Simplified the training data referencing * Sample - Added training_data_for_classification calculation * Updated a stale component.yaml file * Fixed the input type * Fixed the default loss function for the regression training * Fixed the CSV reading in the predictor components * Fixed the prediction_type * Fixed the class predictions saving * Update sample_pipeline.py * Finalized the sample
kubeflow · Jun 17, 2020 · c809fc8 · c809fc8
1 parent 4f5a7f0
commit c809fc8
Show file tree

Hide file tree

Showing 15 changed files with 1,393 additions and 0 deletions.
diff --git a/components/CatBoost/Predict_class_probabilities/from_CSV/component.py b/components/CatBoost/Predict_class_probabilities/from_CSV/component.py
@@ -0,0 +1,58 @@
+from kfp.components import InputPath, OutputPath, create_component_from_func
+
+def catboost_predict_class_probabilities(
+    data_path: InputPath('CSV'),
+    model_path: InputPath('CatBoostModel'),
+    predictions_path: OutputPath(),
+
+    label_column: int = None,
+):
+    '''Predict class probabilities with a CatBoost model.
+
+    Args:
+        data_path: Path for the data in CSV format.
+        model_path: Path for the trained model in binary CatBoostModel format.
+        label_column: Column containing the label data.
+        predictions_path: Output path for the predictions.
+
+    Outputs:
+        predictions: Predictions in text format.
+
+    Annotations:
+        author: Alexey Volkov <alexey.volkov@ark-kun.com>
+    '''
+    import tempfile
+
+    from catboost import CatBoost, Pool
+    import numpy
+
+    if label_column:
+        column_descriptions = {label_column: 'Label'}
+        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
+        with open(column_description_path, 'w') as column_description_file:
+            for idx, kind in column_descriptions.items():
+                column_description_file.write('{}\t{}\n'.format(idx, kind))
+    else:
+        column_description_path = None
+
+    eval_data = Pool(
+        data_path,
+        column_description=column_description_path,
+        has_header=True,
+        delimiter=',',
+    )
+
+    model = CatBoost()
+    model.load_model(model_path)
+
+    predictions = model.predict(eval_data, prediction_type='Probability')
+    numpy.savetxt(predictions_path, predictions)
+
+
+if __name__ == '__main__':
+    catboost_predict_class_probabilities_op = create_component_from_func(
+        catboost_predict_class_probabilities,
+        output_component_file='component.yaml',
+        base_image='python:3.7',
+        packages_to_install=['catboost==0.23']
+    )
diff --git a/components/CatBoost/Predict_class_probabilities/from_CSV/component.yaml b/components/CatBoost/Predict_class_probabilities/from_CSV/component.yaml
@@ -0,0 +1,108 @@
+# Component definition for predicting class probabilities with a CatBoost model.
+# The embedded Python program below is what runs inside the container.
+name: Catboost predict class probabilities
+description: |-
+  Predict class probabilities with a CatBoost model.
+
+      Args:
+          data_path: Path for the data in CSV format.
+          model_path: Path for the trained model in binary CatBoostModel format.
+          label_column: Column containing the label data.
+          predictions_path: Output path for the predictions.
+
+      Outputs:
+          predictions: Predictions in text format.
+
+      Annotations:
+          author: Alexey Volkov <alexey.volkov@ark-kun.com>
+inputs:
+- {name: data, type: CSV}
+- {name: model, type: CatBoostModel}
+- {name: label_column, type: Integer, optional: true}
+outputs:
+- {name: predictions}
+implementation:
+  container:
+    image: python:3.7
+    command:
+    - sh
+    - -c
+    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
+      'catboost==0.23' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
+      --no-warn-script-location 'catboost==0.23' --user) && "$0" "$@"
+    - python3
+    - -u
+    - -c
+    - |
+      def _make_parent_dirs_and_return_path(file_path: str):
+          import os
+          os.makedirs(os.path.dirname(file_path), exist_ok=True)
+          return file_path
+
+      def catboost_predict_class_probabilities(
+          data_path,
+          model_path,
+          predictions_path,
+
+          label_column = None,
+      ):
+          '''Predict class probabilities with a CatBoost model.
+
+          Args:
+              data_path: Path for the data in CSV format.
+              model_path: Path for the trained model in binary CatBoostModel format.
+              label_column: Column containing the label data.
+              predictions_path: Output path for the predictions.
+
+          Outputs:
+              predictions: Predictions in text format.
+
+          Annotations:
+              author: Alexey Volkov <alexey.volkov@ark-kun.com>
+          '''
+          import tempfile
+
+          from catboost import CatBoost, Pool
+          import numpy
+
+          # Compare against None explicitly: "--label-column 0" parses to the int 0,
+          # which a truthiness check would silently ignore.
+          if label_column is not None:
+              # delete=False keeps the file on disk after the handle is closed so that
+              # Pool can read it by path.
+              with tempfile.NamedTemporaryFile(mode='w', delete=False) as column_description_file:
+                  column_description_file.write('{}\t{}\n'.format(label_column, 'Label'))
+              column_description_path = column_description_file.name
+          else:
+              column_description_path = None
+
+          eval_data = Pool(
+              data_path,
+              column_description=column_description_path,
+              has_header=True,
+              delimiter=',',
+          )
+
+          model = CatBoost()
+          model.load_model(model_path)
+
+          predictions = model.predict(eval_data, prediction_type='Probability')
+          numpy.savetxt(predictions_path, predictions)
+
+      import argparse
+      _parser = argparse.ArgumentParser(prog='Catboost predict class probabilities', description='Predict class probabilities with a CatBoost model.\n\n    Args:\n        data_path: Path for the data in CSV format.\n        model_path: Path for the trained model in binary CatBoostModel format.\n        label_column: Column containing the label data.\n        predictions_path: Output path for the predictions.\n\n    Outputs:\n        predictions: Predictions in text format.\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>')
+      _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
+      _parsed_args = vars(_parser.parse_args())
+
+      _outputs = catboost_predict_class_probabilities(**_parsed_args)
+    args:
+    - --data
+    - {inputPath: data}
+    - --model
+    - {inputPath: model}
+    # --label-column is only passed when the optional input was supplied.
+    - if:
+        cond: {isPresent: label_column}
+        then:
+        - --label-column
+        - {inputValue: label_column}
+    - --predictions
+    - {outputPath: predictions}
diff --git a/components/CatBoost/Predict_classes/from_CSV/component.py b/components/CatBoost/Predict_classes/from_CSV/component.py
@@ -0,0 +1,58 @@
+from kfp.components import InputPath, OutputPath, create_component_from_func
+
+def catboost_predict_classes(
+    data_path: InputPath('CSV'),
+    model_path: InputPath('CatBoostModel'),
+    predictions_path: OutputPath(),
+
+    label_column: int = None,
+):
+    '''Predict classes using the CatBoost classifier model.
+
+    Args:
+        data_path: Path for the data in CSV format.
+        model_path: Path for the trained model in binary CatBoostModel format.
+        label_column: Column containing the label data.
+        predictions_path: Output path for the predictions.
+
+    Outputs:
+        predictions: Class predictions in text format.
+
+    Annotations:
+        author: Alexey Volkov <alexey.volkov@ark-kun.com>
+    '''
+    import tempfile
+
+    from catboost import CatBoostClassifier, Pool
+    import numpy
+
+    if label_column:
+        column_descriptions = {label_column: 'Label'}
+        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
+        with open(column_description_path, 'w') as column_description_file:
+            for idx, kind in column_descriptions.items():
+                column_description_file.write('{}\t{}\n'.format(idx, kind))
+    else:
+        column_description_path = None
+
+    eval_data = Pool(
+        data_path,
+        column_description=column_description_path,
+        has_header=True,
+        delimiter=',',
+    )
+
+    model = CatBoostClassifier()
+    model.load_model(model_path)
+
+    predictions = model.predict(eval_data)
+    numpy.savetxt(predictions_path, predictions, fmt='%s')
+
+
+if __name__ == '__main__':
+    catboost_predict_classes_op = create_component_from_func(
+        catboost_predict_classes,
+        output_component_file='component.yaml',
+        base_image='python:3.7',
+        packages_to_install=['catboost==0.22']
+    )
diff --git a/components/CatBoost/Predict_classes/from_CSV/component.yaml b/components/CatBoost/Predict_classes/from_CSV/component.yaml
@@ -0,0 +1,108 @@
+# Component definition for predicting classes with a CatBoost classifier model.
+# The embedded Python program below is what runs inside the container.
+name: Catboost predict classes
+description: |-
+  Predict classes using the CatBoost classifier model.
+
+      Args:
+          data_path: Path for the data in CSV format.
+          model_path: Path for the trained model in binary CatBoostModel format.
+          label_column: Column containing the label data.
+          predictions_path: Output path for the predictions.
+
+      Outputs:
+          predictions: Class predictions in text format.
+
+      Annotations:
+          author: Alexey Volkov <alexey.volkov@ark-kun.com>
+inputs:
+- {name: data, type: CSV}
+- {name: model, type: CatBoostModel}
+- {name: label_column, type: Integer, optional: true}
+outputs:
+- {name: predictions}
+implementation:
+  container:
+    image: python:3.7
+    command:
+    - sh
+    - -c
+    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
+      'catboost==0.22' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet
+      --no-warn-script-location 'catboost==0.22' --user) && "$0" "$@"
+    - python3
+    - -u
+    - -c
+    - |
+      def _make_parent_dirs_and_return_path(file_path: str):
+          import os
+          os.makedirs(os.path.dirname(file_path), exist_ok=True)
+          return file_path
+
+      def catboost_predict_classes(
+          data_path,
+          model_path,
+          predictions_path,
+
+          label_column = None,
+      ):
+          '''Predict classes using the CatBoost classifier model.
+
+          Args:
+              data_path: Path for the data in CSV format.
+              model_path: Path for the trained model in binary CatBoostModel format.
+              label_column: Column containing the label data.
+              predictions_path: Output path for the predictions.
+
+          Outputs:
+              predictions: Class predictions in text format.
+
+          Annotations:
+              author: Alexey Volkov <alexey.volkov@ark-kun.com>
+          '''
+          import tempfile
+
+          from catboost import CatBoostClassifier, Pool
+          import numpy
+
+          # Compare against None explicitly: "--label-column 0" parses to the int 0,
+          # which a truthiness check would silently ignore.
+          if label_column is not None:
+              # delete=False keeps the file on disk after the handle is closed so that
+              # Pool can read it by path.
+              with tempfile.NamedTemporaryFile(mode='w', delete=False) as column_description_file:
+                  column_description_file.write('{}\t{}\n'.format(label_column, 'Label'))
+              column_description_path = column_description_file.name
+          else:
+              column_description_path = None
+
+          eval_data = Pool(
+              data_path,
+              column_description=column_description_path,
+              has_header=True,
+              delimiter=',',
+          )
+
+          model = CatBoostClassifier()
+          model.load_model(model_path)
+
+          predictions = model.predict(eval_data)
+          numpy.savetxt(predictions_path, predictions, fmt='%s')
+
+      import argparse
+      _parser = argparse.ArgumentParser(prog='Catboost predict classes', description='Predict classes using the CatBoost classifier model.\n\n    Args:\n        data_path: Path for the data in CSV format.\n        model_path: Path for the trained model in binary CatBoostModel format.\n        label_column: Column containing the label data.\n        predictions_path: Output path for the predictions.\n\n    Outputs:\n        predictions: Class predictions in text format.\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>')
+      _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
+      _parsed_args = vars(_parser.parse_args())
+
+      _outputs = catboost_predict_classes(**_parsed_args)
+    args:
+    - --data
+    - {inputPath: data}
+    - --model
+    - {inputPath: model}
+    # --label-column is only passed when the optional input was supplied.
+    - if:
+        cond: {isPresent: label_column}
+        then:
+        - --label-column
+        - {inputValue: label_column}
+    - --predictions
+    - {outputPath: predictions}
diff --git a/components/CatBoost/Predict_values/from_CSV/component.py b/components/CatBoost/Predict_values/from_CSV/component.py
@@ -0,0 +1,58 @@
+from kfp.components import InputPath, OutputPath, create_component_from_func
+
+def catboost_predict_values(
+    data_path: InputPath('CSV'),
+    model_path: InputPath('CatBoostModel'),
+    predictions_path: OutputPath(),
+
+    label_column: int = None,
+):
+    '''Predict values with a CatBoost model.
+
+    Args:
+        data_path: Path for the data in CSV format.
+        model_path: Path for the trained model in binary CatBoostModel format.
+        label_column: Column containing the label data.
+        predictions_path: Output path for the predictions.
+
+    Outputs:
+        predictions: Predictions in text format.
+
+    Annotations:
+        author: Alexey Volkov <alexey.volkov@ark-kun.com>
+    '''
+    import tempfile
+
+    from catboost import CatBoost, Pool
+    import numpy
+
+    if label_column:
+        column_descriptions = {label_column: 'Label'}
+        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
+        with open(column_description_path, 'w') as column_description_file:
+            for idx, kind in column_descriptions.items():
+                column_description_file.write('{}\t{}\n'.format(idx, kind))
+    else:
+        column_description_path = None
+
+    eval_data = Pool(
+        data_path,
+        column_description=column_description_path,
+        has_header=True,
+        delimiter=',',
+    )
+
+    model = CatBoost()
+    model.load_model(model_path)
+
+    predictions = model.predict(eval_data, prediction_type='RawFormulaVal')
+    numpy.savetxt(predictions_path, predictions)
+
+
+if __name__ == '__main__':
+    catboost_predict_values_op = create_component_from_func(
+        catboost_predict_values,
+        output_component_file='component.yaml',
+        base_image='python:3.7',
+        packages_to_install=['catboost==0.23']
+    )