Skip to content

Commit 88636df

Browse files
Modified to use dataset and pipelinedata (#193)
* Modified to use dataset and pipelinedata * Create diabetes dataset if no dataset specified * Reverted null build_id check * Added MSE tag * Reverted logic for NULL Build ID * Force new model if previous one has no metric tag * Removed unused DATA variables * Removed unused DATA vars * Updated workaround for non-metric-tagged models * tidied up code
1 parent e108e07 commit 88636df

File tree

7 files changed

+137
-90
lines changed

7 files changed

+137
-90
lines changed

.env.example

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py'
3232
REGISTER_SCRIPT_PATH = 'register/register_model.py'
3333
SOURCES_DIR_TRAIN = 'diabetes_regression'
3434
DATASET_NAME = 'diabetes_ds'
35-
DATASTORE_NAME = 'datablobstore'
36-
DATAFILE_NAME = 'diabetes.csv'
3735

3836
# Optional. Used by a training pipeline with R on Databricks
3937
DB_CLUSTER_ID = ''

diabetes_regression/evaluate/evaluate_model.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
help="Name of the Model",
9191
default="sklearn_regression_model.pkl",
9292
)
93+
9394
parser.add_argument(
9495
"--allow_run_cancel",
9596
type=str,
@@ -122,18 +123,10 @@
122123
model_name, tag_name, exp.name, ws)
123124

124125
if (model is not None):
125-
126-
production_model_run_id = model.run_id
127-
128-
# Get the run history for both production model and
129-
# newly trained model and compare mse
130-
production_model_run = Run(exp, run_id=production_model_run_id)
131-
new_model_run = run.parent
132-
print("Production model run is", production_model_run)
133-
134-
production_model_mse = \
135-
production_model_run.get_metrics().get(metric_eval)
136-
new_model_mse = new_model_run.get_metrics().get(metric_eval)
126+
production_model_mse = 10000
127+
if (metric_eval in model.tags):
128+
production_model_mse = float(model.tags[metric_eval])
129+
new_model_mse = float(run.parent.get_metrics().get(metric_eval))
137130
if (production_model_mse is None or new_model_mse is None):
138131
print("Unable to find", metric_eval, "metrics, "
139132
"exiting evaluation")
@@ -151,7 +144,7 @@
151144
print("New trained model performs better, "
152145
"thus it should be registered")
153146
else:
154-
print("New trained model metric is less than or equal to "
147+
print("New trained model metric is worse than or equal to "
155148
"production model so skipping model registration.")
156149
if((allow_run_cancel).lower() == 'true'):
157150
run.parent.cancel()

diabetes_regression/register/register_model.py

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import sys
2828
import argparse
2929
import traceback
30+
import joblib
3031
from azureml.core import Run, Experiment, Workspace
3132
from azureml.core.model import Model as AMLModel
3233

@@ -63,17 +64,24 @@ def main():
6364
type=str,
6465
help="The Build ID of the build triggering this pipeline run",
6566
)
67+
6668
parser.add_argument(
6769
"--run_id",
6870
type=str,
6971
help="Training run ID",
7072
)
73+
7174
parser.add_argument(
7275
"--model_name",
7376
type=str,
7477
help="Name of the Model",
7578
default="sklearn_regression_model.pkl",
7679
)
80+
parser.add_argument(
81+
"--step_input",
82+
type=str,
83+
help=("input from previous steps")
84+
)
7785

7886
args = parser.parse_args()
7987
if (args.build_id is not None):
@@ -83,18 +91,42 @@ def main():
8391
if (run_id == 'amlcompute'):
8492
run_id = run.parent.id
8593
model_name = args.model_name
94+
model_path = args.step_input
8695

87-
if (build_id is None):
88-
register_aml_model(model_name, exp, run_id)
89-
else:
90-
run.tag("BuildId", value=build_id)
91-
builduri_base = os.environ.get("BUILDURI_BASE")
92-
if (builduri_base is not None):
93-
build_uri = builduri_base + build_id
94-
run.tag("BuildUri", value=build_uri)
95-
register_aml_model(model_name, exp, run_id, build_id, build_uri)
96+
# load the model
97+
print("Loading model from " + model_path)
98+
model_file = os.path.join(model_path, model_name)
99+
model = joblib.load(model_file)
100+
model_mse = run.parent.get_metrics()["mse"]
101+
102+
if (model is not None):
103+
if (build_id is None):
104+
register_aml_model(model_file, model_name, exp, run_id)
96105
else:
97-
register_aml_model(model_name, exp, run_id, build_id)
106+
run.tag("BuildId", value=build_id)
107+
builduri_base = os.environ.get("BUILDURI_BASE")
108+
if (builduri_base is not None):
109+
build_uri = builduri_base + build_id
110+
run.tag("BuildUri", value=build_uri)
111+
register_aml_model(
112+
model_file,
113+
model_name,
114+
model_mse,
115+
exp,
116+
run_id,
117+
build_id,
118+
build_uri)
119+
else:
120+
register_aml_model(
121+
model_file,
122+
model_name,
123+
model_mse,
124+
exp,
125+
run_id,
126+
build_id)
127+
else:
128+
print("Model not found. Skipping model registration.")
129+
sys.exit(0)
98130

99131

100132
def model_already_registered(model_name, exp, run_id):
@@ -109,35 +141,30 @@ def model_already_registered(model_name, exp, run_id):
109141

110142

111143
def register_aml_model(
144+
model_path,
112145
model_name,
146+
model_mse,
113147
exp,
114148
run_id,
115149
build_id: str = 'none',
116150
build_uri=None
117151
):
118152
try:
153+
tagsValue = {"area": "diabetes_regression",
154+
"run_id": run_id,
155+
"experiment_name": exp.name,
156+
"mse": model_mse}
119157
if (build_id != 'none'):
120158
model_already_registered(model_name, exp, run_id)
121-
run = Run(experiment=exp, run_id=run_id)
122-
tagsValue = {"area": "diabetes_regression",
123-
"BuildId": build_id, "run_id": run_id,
124-
"experiment_name": exp.name}
159+
tagsValue["BuildId"] = build_id
125160
if (build_uri is not None):
126161
tagsValue["BuildUri"] = build_uri
127-
else:
128-
run = Run(experiment=exp, run_id=run_id)
129-
if (run is not None):
130-
tagsValue = {"area": "diabetes_regression",
131-
"run_id": run_id, "experiment_name": exp.name}
132-
else:
133-
print("A model run for experiment", exp.name,
134-
"matching properties run_id =", run_id,
135-
"was not found. Skipping model registration.")
136-
sys.exit(0)
137-
138-
model = run.register_model(model_name=model_name,
139-
model_path="./outputs/" + model_name,
140-
tags=tagsValue)
162+
163+
model = AMLModel.register(
164+
workspace=exp.workspace,
165+
model_name=model_name,
166+
model_path=model_path,
167+
tags=tagsValue)
141168
os.chdir("..")
142169
print(
143170
"Model registered: {} \nModel Description: {} "

diabetes_regression/training/train.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,8 @@
2424
POSSIBILITY OF SUCH DAMAGE.
2525
"""
2626
from azureml.core.run import Run
27-
from azureml.core import Dataset
2827
import os
2928
import argparse
30-
from sklearn.datasets import load_diabetes
3129
from sklearn.linear_model import Ridge
3230
from sklearn.metrics import mean_squared_error
3331
from sklearn.model_selection import train_test_split
@@ -65,19 +63,20 @@ def main():
6563
)
6664

6765
parser.add_argument(
68-
"--dataset_name",
66+
"--step_output",
6967
type=str,
70-
help=("Dataset with the training data")
68+
help=("output for passing data to next step")
7169
)
70+
7271
args = parser.parse_args()
7372

7473
print("Argument [build_id]: %s" % args.build_id)
7574
print("Argument [model_name]: %s" % args.model_name)
76-
print("Argument [dataset_name]: %s" % args.dataset_name)
75+
print("Argument [step_output]: %s" % args.step_output)
7776

7877
model_name = args.model_name
7978
build_id = args.build_id
80-
dataset_name = args.dataset_name
79+
step_output_path = args.step_output
8180

8281
print("Getting training parameters")
8382

@@ -91,15 +90,17 @@ def main():
9190
print("Parameter alpha: %s" % alpha)
9291

9392
run = Run.get_context()
94-
ws = run.experiment.workspace
9593

96-
if (dataset_name):
97-
dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
94+
# Get the dataset
95+
dataset = run.input_datasets['training_data']
96+
if (dataset):
9897
df = dataset.to_pandas_dataframe()
9998
X = df.values
10099
y = df.Y
101100
else:
102-
X, y = load_diabetes(return_X_y=True)
101+
e = ("No dataset provided")
102+
print(e)
103+
raise Exception(e)
103104

104105
X_train, X_test, y_train, y_test = train_test_split(
105106
X, y, test_size=0.2, random_state=0)
@@ -108,21 +109,18 @@ def main():
108109

109110
reg = train_model(run, data, alpha)
110111

111-
joblib.dump(value=reg, filename=model_name)
112-
113-
# upload model file explicitly into artifacts for parent run
114-
run.parent.upload_file(name="./outputs/" + model_name,
115-
path_or_stream=model_name)
116-
print("Uploaded the model {} to experiment {}".format(
117-
model_name, run.experiment.name))
118-
dirpath = os.getcwd()
119-
print(dirpath)
120-
print("Following files are uploaded ")
121-
print(run.parent.get_file_names())
112+
# Pass model file to next step
113+
os.makedirs(step_output_path, exist_ok=True)
114+
model_output_path = os.path.join(step_output_path, model_name)
115+
joblib.dump(value=reg, filename=model_output_path)
122116

123-
run.parent.tag("BuildId", value=build_id)
117+
# Also upload model file to run outputs for history
118+
os.makedirs('outputs', exist_ok=True)
119+
output_path = os.path.join('outputs', model_name)
120+
joblib.dump(value=reg, filename=output_path)
124121

125122
# Add properties to identify this specific training run
123+
run.parent.tag("BuildId", value=build_id)
126124
run.tag("BuildId", value=build_id)
127125
run.tag("run_type", value="train")
128126
builduri_base = os.environ.get("BUILDURI_BASE")

docs/getting_started.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ the BASE_NAME value should not exceed 10 characters and it should contain number
7575

7676
The **RESOURCE_GROUP** parameter is used as the name for the resource group that will hold the Azure resources for the solution. If providing an existing AML Workspace, set this value to the corresponding resource group name.
7777

78+
The **WORKSPACE_SVC_CONNECTION** parameter is used to reference a service connection for the Azure ML workspace. You will create this service connection after provisioning the workspace (we recommend using the IaC pipeline as described below) and installing the Azure ML extension in your Azure DevOps project.
79+
80+
Optionally, a **DATASET_NAME** parameter can be used to reference a training dataset that you have registered in your Azure ML workspace (more details below).
81+
7882
Make sure to select the **Allow access to all pipelines** checkbox in the
7983
variable group configuration.
8084

@@ -125,8 +129,7 @@ Check out the newly created resources in the [Azure Portal](https://portal.azure
125129

126130
(Optional) To remove the resources created for this project you can use the [/environment_setup/iac-remove-environment.yml](../environment_setup/iac-remove-environment.yml) definition or you can just delete the resource group in the [Azure Portal](https://portal.azure.com).
127131

128-
**Note:** The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. If you want to use your own dataset, you need to [create and register a datastore](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data#azure-machine-learning-studio) in your ML workspace and upload the datafile (e.g. [diabetes.csv](./data/diabetes.csv)) to the corresponding blob container. You can also define a datastore in the ML Workspace with [az cli](https://docs.microsoft.com/en-us/cli/azure/ext/azure-cli-ml/ml/datastore?view=azure-cli-latest#ext-azure-cli-ml-az-ml-datastore-attach-blob).
129-
You'll also need to configure DATASTORE_NAME and DATAFILE_NAME variables in ***devopsforai-aml-vg*** variable group.
132+
**Note:** The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. To use your own data, you need to [create a Dataset](https://docs.microsoft.com/azure/machine-learning/how-to-create-register-datasets) in your workspace and specify its name in a DATASET_NAME variable in the ***devopsforai-aml-vg*** variable group. You will also need to modify the test cases in the **ml_service/util/smoke_test_scoring_service.py** script to match the schema of the training features in your dataset.
130133

131134
## Create an Azure DevOps Azure ML Workspace Service Connection
132135

ml_service/pipelines/diabetes_regression_build_train_pipeline.py

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from azureml.pipeline.core.graph import PipelineParameter
22
from azureml.pipeline.steps import PythonScriptStep
3-
from azureml.pipeline.core import Pipeline
3+
from azureml.pipeline.core import Pipeline, PipelineData
44
from azureml.core import Workspace, Environment
55
from azureml.core.runconfig import RunConfiguration
6-
from azureml.core import Dataset, Datastore
6+
from azureml.core import Dataset
77
from ml_service.util.attach_compute import get_compute
88
from ml_service.util.env_variables import Env
9+
from sklearn.datasets import load_diabetes
10+
import pandas as pd
11+
import os
912

1013

1114
def main():
@@ -45,26 +48,59 @@ def main():
4548
build_id_param = PipelineParameter(
4649
name="build_id", default_value=e.build_id)
4750

48-
dataset_name = ""
49-
if (e.datastore_name is not None and e.datafile_name is not None):
50-
dataset_name = e.dataset_name
51-
datastore = Datastore.get(aml_workspace, e.datastore_name)
52-
data_path = [(datastore, e.datafile_name)]
53-
dataset = Dataset.Tabular.from_delimited_files(path=data_path)
54-
dataset.register(workspace=aml_workspace,
55-
name=e.dataset_name,
56-
description="dataset with training data",
57-
create_new_version=True)
51+
# Get dataset name
52+
dataset_name = e.dataset_name
53+
54+
# Check to see if dataset exists
55+
if (dataset_name not in aml_workspace.datasets):
56+
# Create dataset from diabetes sample data
57+
sample_data = load_diabetes()
58+
df = pd.DataFrame(
59+
data=sample_data.data,
60+
columns=sample_data.feature_names)
61+
df['Y'] = sample_data.target
62+
file_name = 'diabetes.csv'
63+
df.to_csv(file_name, index=False)
64+
65+
# Upload file to default datastore in workspace
66+
default_ds = aml_workspace.get_default_datastore()
67+
target_path = 'training-data/'
68+
default_ds.upload_files(
69+
files=[file_name],
70+
target_path=target_path,
71+
overwrite=True,
72+
show_progress=False)
73+
74+
# Register dataset
75+
path_on_datastore = os.path.join(target_path, file_name)
76+
dataset = Dataset.Tabular.from_delimited_files(
77+
path=(default_ds, path_on_datastore))
78+
dataset = dataset.register(
79+
workspace=aml_workspace,
80+
name=dataset_name,
81+
description='diabetes training data',
82+
tags={'format': 'CSV'},
83+
create_new_version=True)
84+
85+
# Get the dataset
86+
dataset = Dataset.get_by_name(aml_workspace, dataset_name)
87+
88+
# Create a PipelineData to pass data between steps
89+
pipeline_data = PipelineData(
90+
'pipeline_data',
91+
datastore=aml_workspace.get_default_datastore())
5892

5993
train_step = PythonScriptStep(
6094
name="Train Model",
6195
script_name=e.train_script_path,
6296
compute_target=aml_compute,
6397
source_directory=e.sources_directory_train,
98+
inputs=[dataset.as_named_input('training_data')],
99+
outputs=[pipeline_data],
64100
arguments=[
65101
"--build_id", build_id_param,
66102
"--model_name", model_name_param,
67-
"--dataset_name", dataset_name,
103+
"--step_output", pipeline_data
68104
],
69105
runconfig=run_config,
70106
allow_reuse=False,
@@ -91,9 +127,11 @@ def main():
91127
script_name=e.register_script_path,
92128
compute_target=aml_compute,
93129
source_directory=e.sources_directory_train,
130+
inputs=[pipeline_data],
94131
arguments=[
95132
"--build_id", build_id_param,
96133
"--model_name", model_name_param,
134+
"--step_input", pipeline_data,
97135
],
98136
runconfig=run_config,
99137
allow_reuse=False,

0 commit comments

Comments
 (0)