Skip to content

Commit

Permalink
Components - XGBoost - Added the Cross_validation_for_regression comp…
Browse files Browse the repository at this point in the history
…onent (#4244)
  • Loading branch information
Ark-kun committed Jul 20, 2020
1 parent 0c65238 commit 34cb59d
Show file tree
Hide file tree
Showing 2 changed files with 339 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from collections import OrderedDict
from kfp import components


# Reusable component factories, each loaded from a component.yaml URL in the
# kubeflow/pipelines repository. Every URL embeds a specific commit hash.
# These are module-level statements, so the loads run whenever this module is
# imported or executed.

# Splits one CSV table into folds; its outputs are read below as
# 'train_1'..'train_5' and 'test_1'..'test_5'.
split_table_into_folds_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/e9b4b29b22a5120daf95b581b0392cd461a906f0/components/dataset_manipulation/split_data_into_folds/in_CSV/component.yaml')
# XGBoost training and prediction on CSV data (both pinned to the same commit).
xgboost_train_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml')
xgboost_predict_on_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml')
# Applies a snippet of pandas code (passed as `transform_code`) to a CSV table.
pandas_transform_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml')
# Removes the header from a table.
drop_header_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml')
# Per-fold regression metrics, and the aggregation of several metrics objects.
calculate_regression_metrics_from_csv_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml')
aggregate_regression_metrics_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/7ea9363fe201918d419fecdc00d1275e657ff712/components/ml_metrics/Aggregate_regression_metrics/component.yaml')


def xgboost_5_fold_cross_validation_for_regression(
    data: 'CSV',
    label_column: int = 0,
    objective: str = 'reg:squarederror',
    num_iterations: int = 200,
):
    """Wire up a 5-fold cross-validation graph for XGBoost regression.

    The input table is split into folds once. For each of the five folds a
    model is trained on the fold's training split, predictions are made on
    its testing split, and regression metrics are calculated from the
    predictions and the true values extracted from the testing split. The
    five per-fold metrics are then passed to the aggregation component.

    Args:
        data: CSV table handed to the fold-splitting component.
        label_column: Forwarded unchanged to both the train and the predict
            components.
        objective: Forwarded to the train component.
        num_iterations: Forwarded to the train component.

    Returns:
        An OrderedDict with the keys 'mean_absolute_error',
        'mean_squared_error', 'root_mean_squared_error' and 'metrics', each
        mapped to the same-named output of the aggregation task.
    """
    fold_outputs = split_table_into_folds_op(data).outputs

    per_fold_metrics = {}
    for fold_number in range(1, 6):
        suffix = str(fold_number)
        train_split = fold_outputs['train_' + suffix]
        test_split = fold_outputs['test_' + suffix]

        # Tasks are created in the same order for every fold: train, predict,
        # label extraction, header removal, metrics calculation.
        train_task = xgboost_train_on_csv_op(
            training_data=train_split,
            label_column=label_column,
            objective=objective,
            num_iterations=num_iterations,
        )
        predict_task = xgboost_predict_on_csv_op(
            data=test_split,
            model=train_task.outputs['model'],
            label_column=label_column,
        )

        # NOTE(review): the true-value column is selected by the literal name
        # "tips" and is not derived from `label_column` - confirm this matches
        # the dataset the component is meant to be used with.
        label_table_task = pandas_transform_csv_op(
            table=test_split,
            transform_code='df = df[["tips"]]',
        )
        headerless_labels_task = drop_header_op(label_table_task.output)

        metrics_task = calculate_regression_metrics_from_csv_op(
            true_values=headerless_labels_task.output,
            predicted_values=predict_task.output,
        )
        per_fold_metrics['metrics_' + suffix] = metrics_task.outputs['metrics']

    aggregated = aggregate_regression_metrics_op(**per_fold_metrics).outputs

    exported_names = (
        'mean_absolute_error',
        'mean_squared_error',
        'root_mean_squared_error',
        'metrics',
    )
    return OrderedDict((name, aggregated[name]) for name in exported_names)


if __name__ == '__main__':
    # When run as a script, turn the pipeline function into a graph component
    # and write its definition to 'component.yaml'.
    xgboost_5_fold_cross_validation_for_regression_op = (
        components.create_graph_component_from_pipeline_func(
            xgboost_5_fold_cross_validation_for_regression,
            output_component_file='component.yaml',
        )
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
# Component interface: one required CSV input (`data`), three optional
# parameters whose defaults are stored as strings, and four outputs.
# NOTE(review): this file looks machine-generated (presumably by
# create_graph_component_from_pipeline_func) - confirm before editing by hand.
name: Xgboost 5 fold cross validation for regression
inputs:
- {name: data, type: CSV}
- {name: label_column, type: Integer, default: '0', optional: true}
- {name: objective, type: String, default: 'reg:squarederror', optional: true}
- {name: num_iterations, type: Integer, default: '200', optional: true}
# Outputs: three Float values plus one JsonObject named `metrics`.
outputs:
- {name: mean_absolute_error, type: Float}
- {name: mean_squared_error, type: Float}
- {name: root_mean_squared_error, type: Float}
- {name: metrics, type: JsonObject}
implementation:
graph:
tasks:
Split table into folds:
componentRef: {digest: 9956223bcecc7294ca1afac39b60ada4a935a571d817c3dfbf2ea4a211afe3d1,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/e9b4b29b22a5120daf95b581b0392cd461a906f0/components/dataset_manipulation/split_data_into_folds/in_CSV/component.yaml'}
arguments:
table:
graphInput: {inputName: data}
Xgboost train:
componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
arguments:
training_data:
taskOutput: {outputName: train_1, taskId: Split table into folds, type: CSV}
label_column:
graphInput: {inputName: label_column}
num_iterations:
graphInput: {inputName: num_iterations}
objective:
graphInput: {inputName: objective}
Xgboost predict:
componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
arguments:
data:
taskOutput: {outputName: test_1, taskId: Split table into folds, type: CSV}
model:
taskOutput: {outputName: model, taskId: Xgboost train, type: XGBoostModel}
label_column:
graphInput: {inputName: label_column}
Pandas Transform DataFrame in CSV format:
componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
arguments:
table:
taskOutput: {outputName: test_1, taskId: Split table into folds, type: CSV}
transform_code: df = df[["tips"]]
Remove header:
componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
arguments:
table:
taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
in CSV format, type: CSV}
Calculate regression metrics from csv:
componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
arguments:
true_values:
taskOutput: {outputName: table, taskId: Remove header}
predicted_values:
taskOutput: {outputName: predictions, taskId: Xgboost predict, type: Text}
Xgboost train 2:
componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
arguments:
training_data:
taskOutput: {outputName: train_2, taskId: Split table into folds, type: CSV}
label_column:
graphInput: {inputName: label_column}
num_iterations:
graphInput: {inputName: num_iterations}
objective:
graphInput: {inputName: objective}
Xgboost predict 2:
componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
arguments:
data:
taskOutput: {outputName: test_2, taskId: Split table into folds, type: CSV}
model:
taskOutput: {outputName: model, taskId: Xgboost train 2, type: XGBoostModel}
label_column:
graphInput: {inputName: label_column}
Pandas Transform DataFrame in CSV format 2:
componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
arguments:
table:
taskOutput: {outputName: test_2, taskId: Split table into folds, type: CSV}
transform_code: df = df[["tips"]]
Remove header 2:
componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
arguments:
table:
taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
in CSV format 2, type: CSV}
Calculate regression metrics from csv 2:
componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
arguments:
true_values:
taskOutput: {outputName: table, taskId: Remove header 2}
predicted_values:
taskOutput: {outputName: predictions, taskId: Xgboost predict 2, type: Text}
Xgboost train 3:
componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
arguments:
training_data:
taskOutput: {outputName: train_3, taskId: Split table into folds, type: CSV}
label_column:
graphInput: {inputName: label_column}
num_iterations:
graphInput: {inputName: num_iterations}
objective:
graphInput: {inputName: objective}
Xgboost predict 3:
componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
arguments:
data:
taskOutput: {outputName: test_3, taskId: Split table into folds, type: CSV}
model:
taskOutput: {outputName: model, taskId: Xgboost train 3, type: XGBoostModel}
label_column:
graphInput: {inputName: label_column}
Pandas Transform DataFrame in CSV format 3:
componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
arguments:
table:
taskOutput: {outputName: test_3, taskId: Split table into folds, type: CSV}
transform_code: df = df[["tips"]]
Remove header 3:
componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
arguments:
table:
taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
in CSV format 3, type: CSV}
Calculate regression metrics from csv 3:
componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
arguments:
true_values:
taskOutput: {outputName: table, taskId: Remove header 3}
predicted_values:
taskOutput: {outputName: predictions, taskId: Xgboost predict 3, type: Text}
Xgboost train 4:
componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
arguments:
training_data:
taskOutput: {outputName: train_4, taskId: Split table into folds, type: CSV}
label_column:
graphInput: {inputName: label_column}
num_iterations:
graphInput: {inputName: num_iterations}
objective:
graphInput: {inputName: objective}
Xgboost predict 4:
componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
arguments:
data:
taskOutput: {outputName: test_4, taskId: Split table into folds, type: CSV}
model:
taskOutput: {outputName: model, taskId: Xgboost train 4, type: XGBoostModel}
label_column:
graphInput: {inputName: label_column}
Pandas Transform DataFrame in CSV format 4:
componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
arguments:
table:
taskOutput: {outputName: test_4, taskId: Split table into folds, type: CSV}
transform_code: df = df[["tips"]]
Remove header 4:
componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
arguments:
table:
taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
in CSV format 4, type: CSV}
Calculate regression metrics from csv 4:
componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
arguments:
true_values:
taskOutput: {outputName: table, taskId: Remove header 4}
predicted_values:
taskOutput: {outputName: predictions, taskId: Xgboost predict 4, type: Text}
Xgboost train 5:
componentRef: {digest: 09b80053da29f8f51575b42e5d2e8ad4b7bdcc92a02c3744e189b1f597006b38,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml'}
arguments:
training_data:
taskOutput: {outputName: train_5, taskId: Split table into folds, type: CSV}
label_column:
graphInput: {inputName: label_column}
num_iterations:
graphInput: {inputName: num_iterations}
objective:
graphInput: {inputName: objective}
Xgboost predict 5:
componentRef: {digest: ecdfaf32cff15b6abc3d0dd80365ce00577f1a19a058fbe201f515431cea1357,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Predict/component.yaml'}
arguments:
data:
taskOutput: {outputName: test_5, taskId: Split table into folds, type: CSV}
model:
taskOutput: {outputName: model, taskId: Xgboost train 5, type: XGBoostModel}
label_column:
graphInput: {inputName: label_column}
Pandas Transform DataFrame in CSV format 5:
componentRef: {digest: 58dc88349157bf128021708c316ce4eb60bc1de0a5a7dd3af45fabac3276d510,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml'}
arguments:
table:
taskOutput: {outputName: test_5, taskId: Split table into folds, type: CSV}
transform_code: df = df[["tips"]]
Remove header 5:
componentRef: {digest: ba35ffea863855b956c3c50aefa0420ba3823949a6c059e6e3971cde960dc5a3,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/02c9638287468c849632cf9f7885b51de4c66f86/components/tables/Remove_header/component.yaml'}
arguments:
table:
taskOutput: {outputName: transformed_table, taskId: Pandas Transform DataFrame
in CSV format 5, type: CSV}
Calculate regression metrics from csv 5:
componentRef: {digest: e3ecbfeb18032820edfee4255e2fb6d15d15ed224e166519d5e528e12053a995,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7da1ac9464b4b3e7d95919faa2f1107a9635b7e4/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml'}
arguments:
true_values:
taskOutput: {outputName: table, taskId: Remove header 5}
predicted_values:
taskOutput: {outputName: predictions, taskId: Xgboost predict 5, type: Text}
Aggregate regression metrics from csv:
componentRef: {digest: 3e128130521eff8d43764f3dcb037316cdd6490ad2878df5adef416f7c2f3c19,
url: 'https://raw.githubusercontent.com/kubeflow/pipelines/7ea9363fe201918d419fecdc00d1275e657ff712/components/ml_metrics/Aggregate_regression_metrics/component.yaml'}
arguments:
metrics_1:
taskOutput: {outputName: metrics, taskId: Calculate regression metrics
from csv, type: JsonObject}
metrics_2:
taskOutput: {outputName: metrics, taskId: Calculate regression metrics
from csv 2, type: JsonObject}
metrics_3:
taskOutput: {outputName: metrics, taskId: Calculate regression metrics
from csv 3, type: JsonObject}
metrics_4:
taskOutput: {outputName: metrics, taskId: Calculate regression metrics
from csv 4, type: JsonObject}
metrics_5:
taskOutput: {outputName: metrics, taskId: Calculate regression metrics
from csv 5, type: JsonObject}
outputValues:
mean_absolute_error:
taskOutput: {outputName: mean_absolute_error, taskId: Aggregate regression
metrics from csv, type: Float}
mean_squared_error:
taskOutput: {outputName: mean_squared_error, taskId: Aggregate regression
metrics from csv, type: Float}
root_mean_squared_error:
taskOutput: {outputName: root_mean_squared_error, taskId: Aggregate regression
metrics from csv, type: Float}
metrics:
taskOutput: {outputName: metrics, taskId: Aggregate regression metrics from
csv, type: JsonObject}

0 comments on commit 34cb59d

Please sign in to comment.