From 8d373e77a48d0517957028184937b5e2f914f05b Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Sun, 19 Jul 2020 22:08:50 -0700
Subject: [PATCH] Components - XGBoost - Added the Train_regression_and_calculate_metrics component (#4243)

---
Review notes (free text between the three-dash line and the diffstat; not
part of the commit message or of the diff):

 * Purpose: component.py defines a pipeline function that chains five
   components loaded by URL (XGBoost Train, XGBoost Predict, pandas
   Transform_DataFrame, Remove_header, Calculate_regression_metrics) and
   returns the trained model plus mean_absolute_error, mean_squared_error,
   root_mean_squared_error and metrics. Running the file as a script writes
   component.yaml via create_graph_component_from_pipeline_func.
 * NOTE(review): the true values are selected with the constant transform
   code 'df = df[["tips"]]', while the train and predict steps receive the
   `label_column` input. This looks like it only gives meaningful metrics
   when the testing CSV has a column named "tips" and `label_column` points
   at that same column -- confirm against the Predict component.
 * NOTE(review): component.yaml mirrors component.py (same URLs, same
   constant transform code) and records a digest per child component, so
   it is presumably meant to be regenerated from component.py rather than
   edited by hand -- confirm before changing either file.
 * Line structure of this patch was restored from a whitespace-collapsed
   copy; the indentation of wrapped YAML flow mappings is a conventional
   reconstruction. No token of the payload was changed, so the hunk line
   counts (56 and 75) still hold.

 .../from_CSV/component.py                     | 56 ++++++++++++++
 .../from_CSV/component.yaml                   | 75 +++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
 create mode 100644 components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml

diff --git a/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
new file mode 100644
index 00000000000..86d48a0aa9e
--- /dev/null
+++ b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.py
@@ -0,0 +1,56 @@
+from collections import OrderedDict
+from kfp import components
+
+
+xgboost_train_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml')
+xgboost_predict_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml')
+pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml')
+drop_header_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml')
+calculate_regression_metrics_from_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml')
+
+
+def xgboost_train_regression_and_calculate_metrics_on_csv(
+    training_data: 'CSV',
+    testing_data: 'CSV',
+    label_column: int = 0,
+    objective: str = 'reg:squarederror',
+    num_iterations: int = 200,
+):
+    model = xgboost_train_on_csv_op(
+        training_data=training_data,
+        label_column=label_column,
+        objective=objective,
+        num_iterations=num_iterations,
+    ).outputs['model']
+
+    predictions = xgboost_predict_on_csv_op(
+        data=testing_data,
+        model=model,
+        label_column=label_column,
+    ).output
+
+    true_values_table = pandas_transform_csv_op(
+        table=testing_data,
+        transform_code='df = df[["tips"]]',
+    ).output
+
+    true_values = drop_header_op(true_values_table).output
+
+    metrics_task = calculate_regression_metrics_from_csv_op(
+        true_values=true_values,
+        predicted_values=predictions,
+    )
+    return OrderedDict([
+        ('model', model),
+        ('mean_absolute_error', metrics_task.outputs['mean_absolute_error']),
+        ('mean_squared_error', metrics_task.outputs['mean_squared_error']),
+        ('root_mean_squared_error', metrics_task.outputs['root_mean_squared_error']),
+        ('metrics', metrics_task.outputs['metrics']),
+    ])
+
+
+if __name__ == '__main__':
+    xgboost_train_regression_and_calculate_metrics_on_csv_op = components.create_graph_component_from_pipeline_func(
+        xgboost_train_regression_and_calculate_metrics_on_csv,
+        output_component_file='component.yaml',
+    )
diff --git a/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml
new file mode 100644
index 00000000000..23e5c493020
--- /dev/null
+++ b/components/XGBoost/Train_regression_and_calculate_metrics/from_CSV/component.yaml
@@ -0,0 +1,75 @@
+name: Xgboost train regression and calculate metrics on csv
+inputs:
+- {name: training_data, type: CSV}
+- {name: testing_data, type: CSV}
+- {name: label_column, type: Integer, default: '0', optional: true}
+- {name: objective, type: String, default: 'reg:squarederror', optional: true}
+- {name: num_iterations, type: Integer, default: '200', optional: true}
+outputs:
+- {name: model, type: XGBoostModel}
+- {name: mean_absolute_error, type: Float}
+- {name: mean_squared_error, type: Float}
+- {name: root_mean_squared_error, type: Float}
+- {name: metrics, type: JsonObject}
+implementation:
+  graph:
+    tasks:
+      Xgboost train:
+        componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
+        arguments:
+          training_data:
+            graphInput: {inputName: training_data}
+          label_column:
+            graphInput: {inputName: label_column}
+          num_iterations:
+            graphInput: {inputName: num_iterations}
+          objective:
+            graphInput: {inputName: objective}
+      Xgboost predict:
+        componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
+        arguments:
+          data:
+            graphInput: {inputName: testing_data}
+          model:
+            taskOutput: {outputName: model, taskId: Xgboost train, type: XGBoostModel}
+          label_column:
+            graphInput: {inputName: label_column}
+      Pandas Transform DataFrame in CSV format:
+        componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
+        arguments:
+          table:
+            graphInput: {inputName: testing_data}
+          transform_code: df = df[["tips"]]
+      Remove header:
+        componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
+        arguments:
+          table:
+            taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
+                in CSV format, type: CSV}
+      Calculate regression metrics from csv:
+        componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
+          url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
+        arguments:
+          true_values:
+            taskOutput: {outputName: table, taskId: Remove header}
+          predicted_values:
+            taskOutput: {outputName: predictions, taskId: Xgboost predict, type: Text}
+    outputValues:
+      model:
+        taskOutput: {outputName: model, taskId: Xgboost train, type: XGBoostModel}
+      mean_absolute_error:
+        taskOutput: {outputName: mean_absolute_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      mean_squared_error:
+        taskOutput: {outputName: mean_squared_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      root_mean_squared_error:
+        taskOutput: {outputName: root_mean_squared_error, taskId: Calculate regression
+            metrics from csv, type: Float}
+      metrics:
+        taskOutput: {outputName: metrics, taskId: Calculate regression metrics from
+            csv, type: JsonObject}