From 8d373e77a48d0517957028184937b5e2f914f05b Mon Sep 17 00:00:00 2001
From: Alexey Volkov <alexey.volkov@ark-kun.com>
Date: Sun, 19 Jul 2020 22:08:50 -0700
Subject: [PATCH] Components - XGBoost - Added the
 Train_regression_and_calculate_metrics component (#4243)

---
 .../from_CSV/component.py                     | 56 ++++++++++++++
 .../from_CSV/component.yaml                   | 75 +++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
 create mode 100644 components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml

diff --git a/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
new file mode 100644
index 00000000000..86d48a0aa9e
--- /dev/null
+++ b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
@@ -0,0 +1,56 @@
+from collections import OrderedDict
+from kfp import components
+
+
+xgboost_train_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml')  # trainer; each URL here pins a specific kubeflow/pipelines commit and matches a componentRef url in component.yaml
+xgboost_predict_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml')  # produces predictions from the trained model
+pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml')  # used below to select the true-values column
+drop_header_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml')  # applied below to the selected true-values table
+calculate_regression_metrics_from_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml')  # consumes true vs. predicted values
+
+
+def xgboost_train_regression_and_calculate_metrics_on_csv(  # Pipeline function: train an XGBoost model on CSV data, predict on the testing CSV, then compute regression metrics.
+    training_data: 'CSV',  # passed only to the training component
+    testing_data: 'CSV',  # used both for prediction and as the source of the true values
+    label_column: int = 0,  # forwarded to the train and predict components only
+    objective: str = 'reg:squarederror',
+    num_iterations: int = 200,
+):
+    model = xgboost_train_on_csv_op(  # step 1: train and keep the component's 'model' output
+        training_data=training_data,
+        label_column=label_column,
+        objective=objective,
+        num_iterations=num_iterations,
+    ).outputs['model']
+
+    predictions = xgboost_predict_on_csv_op(  # step 2: predict on the testing data with the trained model
+        data=testing_data,
+        model=model,
+        label_column=label_column,
+    ).output
+
+    true_values_table = pandas_transform_csv_op(  # step 3: reduce the testing table to the true-values column
+        table=testing_data,
+        transform_code='df = df[["tips"]]',  # NOTE(review): column name "tips" is hard-coded and independent of label_column; looks dataset-specific - confirm or generalize
+    ).output
+
+    true_values = drop_header_op(true_values_table).output  # presumably strips the CSV header row so rows align with the predictions - confirm
+
+    metrics_task = calculate_regression_metrics_from_csv_op(  # step 4: compare true values against predictions
+        true_values=true_values,
+        predicted_values=predictions,
+    )
+    return OrderedDict([  # output name -> task output; component.yaml declares its outputs in this same order
+        ('model', model),
+        ('mean_absolute_error', metrics_task.outputs['mean_absolute_error']),
+        ('mean_squared_error', metrics_task.outputs['mean_squared_error']),
+        ('root_mean_squared_error', metrics_task.outputs['root_mean_squared_error']),
+        ('metrics', metrics_task.outputs['metrics']),
+    ])
+
+        
+if __name__ == '__main__':  # only when executed as a script: build the graph component from the pipeline function
+    xgboost_train_regression_and_calculate_metrics_on_csv_op = components.create_graph_component_from_pipeline_func(
+        xgboost_train_regression_and_calculate_metrics_on_csv,
+        output_component_file='component.yaml',  # relative path, so the spec is written relative to the current working directory
+    )
diff --git a/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml
new file mode 100644
index 00000000000..23e5c493020
--- /dev/null
+++ b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml
@@ -0,0 +1,75 @@
+name: Xgboost train regression and calculate metrics on csv  # NOTE(review): appears to be generated by component.py (output_component_file='component.yaml'); regenerate rather than hand-edit - confirm
+inputs:
+- {name: training_data, type: CSV}
+- {name: testing_data, type: CSV}
+- {name: label_column, type: Integer, default: '0', optional: true}
+- {name: objective, type: String, default: 'reg:squarederror', optional: true}
+- {name: num_iterations, type: Integer, default: '200', optional: true}
+outputs:
+- {name: model, type: XGBoostModel}
+- {name: mean_absolute_error, type: Float}
+- {name: mean_squared_error, type: Float}
+- {name: root_mean_squared_error, type: Float}
+- {name: metrics, type: JsonObject}
+implementation:
+  graph:
+    tasks:
+      Xgboost train:
+        componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
+        arguments:
+          training_data:
+            graphInput: {inputName: training_data}
+          label_column:
+            graphInput: {inputName: label_column}
+          num_iterations:
+            graphInput: {inputName: num_iterations}
+          objective:
+            graphInput: {inputName: objective}
+      Xgboost predict:
+        componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
+        arguments:
+          data:
+            graphInput: {inputName: testing_data}
+          model:
+            taskOutput: {outputName: model, taskId: Xgboost train, type: XGBoostModel}
+          label_column:
+            graphInput: {inputName: label_column}
+      Pandas Transform DataFrame in CSV format:
+        componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
+        arguments:
+          table:
+            graphInput: {inputName: testing_data}
+          transform_code: df = df[["tips"]]  # NOTE(review): hard-coded "tips" column; not derived from the label_column input - confirm
+      Remove header:
+        componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
+        arguments:
+          table:
+            taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
+                in CSV format, type: CSV}
+      Calculate regression metrics from csv:
+        componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
+        arguments:
+          true_values:
+            taskOutput: {outputName: table, taskId: Remove header}
+          predicted_values:
+            taskOutput: {outputName: predictions, taskId: Xgboost predict, type: Text}
+    outputValues:
+      model:
+        taskOutput: {outputName: model, taskId: Xgboost train, type: XGBoostModel}
+      mean_absolute_error:
+        taskOutput: {outputName: mean_absolute_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      mean_squared_error:
+        taskOutput: {outputName: mean_squared_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      root_mean_squared_error:
+        taskOutput: {outputName: root_mean_squared_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      metrics:
+        taskOutput: {outputName: metrics, taskId: Calculate regression metrics from
+            csv, type: JsonObject}