Skip to content

Commit 53518be

Browse files
Added notebook to pipeline
1 parent 414d158 commit 53518be

20 files changed

+5792
-0
lines changed

notebook-to-pipeline/.amlignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2+
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3+
4+
.ipynb_aml_checkpoints/
5+
*.amltmp
6+
*.amltemp

notebook-to-pipeline/Code/.amlignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2+
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3+
4+
.ipynb_aml_checkpoints/
5+
*.amltmp
6+
*.amltemp
7+
.amlignore

notebook-to-pipeline/Code/analysis.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
"""Outlier analysis for a single company's Excel records.

Reads <input-folder>/<company-code>/<input-file>, flags outliers with
three pyod models (PCA, LOF, ABOD), and writes the annotated records
plus diagnostic plots (correlation heatmap, pair plot, 3D outlier
scatter) to <output-folder>/<company-code>.

Exported from a notebook; intended to run as an AzureML pipeline step:
    python analysis.py --company-code CompanyA \
        --input-folder ../Companies --output-folder ../analysis_results
"""
import argparse
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
from pyod.models.abod import ABOD
from pyod.models.lof import LOF
from pyod.models.pca import PCA

# Environment setup needed to create the
# DockerContext/requirements.txt file
# %pip install openpyxl
# %pip install pyod
# %pip install -U kaleido
# You can find specific version of packages
# using the following command
# %pip freeze | grep openpyxl


def _str_to_bool(value):
    """Interpret a command-line string as a boolean.

    argparse's ``type=bool`` is a well-known trap: any non-empty string
    (including "False") is truthy, so ``--showgraph False`` would have
    enabled the graphs. Parse the usual spellings explicitly instead.
    """
    return str(value).strip().lower() in ("1", "true", "yes", "y")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--company-code", type=str, dest="company_code", help="Company code. E.g. CompanyA"
)
parser.add_argument(
    "--input-folder",
    type=str,
    dest="input_folder",
    help="The folder that contains the data",
)
parser.add_argument(
    "--output-folder",
    type=str,
    dest="output_folder",
    help="The folder to store results",
)
parser.add_argument(
    "--input-file",
    type=str,
    dest="input_file",
    help="The name of the Excel containing the data",
    default="Records.xlsx",
)
parser.add_argument(
    "--showgraph",
    type=_str_to_bool,
    dest="showgraph",
    help="Display graphs (true/false)",
    default=False,
)

args = parser.parse_args()

excel_file_path = os.path.join(args.input_folder, args.company_code, args.input_file)
print(excel_file_path)

df = pd.read_excel(excel_file_path, engine="openpyxl")
print(df.head())

# The Date column is an identifier, not a feature; keep df intact and
# model on a copy without it.
df_no_date = df.drop(columns="Date", inplace=False)

# Copy some code from https://github.com/yzhao062/pyod/blob/master/notebooks/Compare%20All%20Models.ipynb
models = {
    "pca": PCA(contamination=0.1, n_components=3),
    "lof": LOF(contamination=0.1),
    "abod": ABOD(contamination=0.1),
}

# Fit each detector and record its verdict (1 = outlier) as a new
# column, named after the model, on the original frame.
for i, (clf_name, clf) in enumerate(models.items()):
    print(i + 1, "fitting", clf_name)
    clf.fit(df_no_date)
    outliers = clf.predict(df_no_date)
    df[clf_name] = outliers

outputs_folder = os.path.join(args.output_folder, args.company_code)
# Ensure that output folder exist
Path(outputs_folder).mkdir(parents=True, exist_ok=True)

df.to_excel(os.path.join(outputs_folder, "outlier_records.xlsx"))

# Create feature correlations plot
features = list(df_no_date.columns)

fig = ff.create_annotated_heatmap(
    np.array(df_no_date.corr().round(2)),
    colorscale="Viridis",
    x=features,
    y=features,
    hoverongaps=True,
)

fig.update_layout(
    paper_bgcolor="white",
    width=1200,
    height=1200,
    # NOTE: titlefont is a deprecated alias of title_font in newer plotly;
    # kept as-is for the pinned plotly==5.10.0.
    titlefont=dict(size=25),
    title_text="Features correlation plot",
)
fig.update_xaxes(tickangle=90, side="bottom")

fig.write_image(os.path.join(outputs_folder, "correlation_plot.png"))
plotly.offline.plot(fig, filename=os.path.join(outputs_folder, "correlation_plot.html"))

# Let's print a pairplot
# https://doobzncoobz.com/seaborn-pairplot/
plt.figure()
sns_plot = sns.pairplot(df_no_date)
sns_plot.fig.set_size_inches(15, 15)
sns_plot.fig.suptitle("Pair plot", y=1.01, size=30)

sns_plot.savefig(os.path.join(outputs_folder, "sns_pairplot.png"))

# 3D scatter of three features, split by the first model's verdict.
# NOTE(review): assumes columns ft01/ft02/ft03 exist in the input
# Excel — confirm against the data schema.
outlier_column = list(models.keys())[0]

fig = px.scatter_3d(data_frame=df, x="ft01", y="ft02", z="ft03", symbol=outlier_column)
fig.update_layout(
    margin=dict(l=30, r=30, b=30, t=30),
    autosize=False,
    width=1000,
    height=1000,
    showlegend=False,
    title={
        "text": f"Outlier Plot ({outlier_column})",
        "y": 0.91,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
)

# circle's are outliers, diamonds are normal entries for the specific model
# Let's change the outliers to red X and the rest into green circles
for i, d in enumerate(fig.data):
    if fig.data[i].marker.symbol == "circle":
        fig.data[i].marker.symbol = "x"
        fig.data[i].marker.color = "red"
    else:
        fig.data[i].marker.symbol = "circle"
        fig.data[i].marker.color = "green"

if args.showgraph:
    fig.show()

fig.write_image(os.path.join(outputs_folder, "outlier_plot.png"))
plotly.offline.plot(fig, filename=os.path.join(outputs_folder, "outlier_plot.html"))
Binary file not shown.
Binary file not shown.

notebook-to-pipeline/CompanyA.xlsx

13.1 KB
Binary file not shown.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.amlignore
2+
.ipynb_aml_checkpoints/
3+
*.amltmp
4+
*.amltemp
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Image for the company-analysis AzureML environment.
FROM python:3.8

# Install the pinned Python dependencies for analysis.py.
COPY requirements.txt .
RUN pip install -r requirements.txt

# Default command when the container is run interactively.
CMD ["bash"]
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Pinned dependencies for the analysis environment (python:3.8 base image).
openpyxl==3.0.10
pyod==1.0.4
kaleido==0.2.1
plotly==5.10.0
pandas==1.1.5
numpy==1.21.6
seaborn==0.11.2
matplotlib==3.2.1
# NOTE: 'argparse' and 'pathlib' ship with the Python 3 standard library.
# The PyPI 'argparse' package is an obsolete Python 2 backport shim and
# 'pathlib2' is unused by the code (analysis.py imports stdlib pathlib),
# so both are intentionally omitted.
84.7 KB
Loading
7.32 KB
Loading

notebook-to-pipeline/Readme.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Converting a notebook into a repeatable AzureML pipeline (cli v2)
2+
3+
This folder contains the sample code used in the following video:
4+
5+
[![Notebook to AzureML pipeline](https://img.youtube.com/vi/4QPqbLf57iU/0.jpg)](https://youtu.be/4QPqbLf57iU)
6+
7+
## Curate the code
8+
9+
Let's assume that you start from [a notebook](./original_notebook.ipynb) that does an analysis reading [an Excel file](./CompanyA.xlsx).
10+
The goal is to make this notebook generic, so that it can read the Excel files from a folder structure like the following:
11+
12+
```bash
13+
Companies
14+
CompanyA
15+
Records.xlsx
16+
CompanyB
17+
Records.xlsx
18+
```
19+
20+
After [modifying the notebook code](./final_notebook.ipynb), you can export [a python script](./Code/analysis.py) which you can execute using the following command:
21+
22+
```bash
23+
cd Code
24+
python analysis.py --company-code CompanyA --input-folder ../Companies --output-folder ../analysis_results
25+
```
26+
27+
The outputs of the analysis are stored in the `analysis_results` folder under a folder named `CompanyA`, as seen below:
28+
29+
![Results tree view](./Readme.ResultsTreeView.png)
30+
31+
## Create the pipeline
32+
33+
To create a pipeline, you need to create [an environment](./environment.yml) which contains all the [software dependencies](./DockerContext/requirements.txt) of your code. You register the environment using the following command:
34+
35+
```dotnetcli
36+
az ml environment create -f environment.yml
37+
```
38+
39+
> Note that in the video the file was named with a capital E, e.g. Environment.yml
40+
41+
You can then execute a job using:
42+
43+
```dotnetcli
44+
az ml job create -f company_analysis_pipeline.yml
45+
```
46+
47+
The job above will execute only once and then reuse the cached outputs if you execute it a second time. If you want to force the re-execution of the company analysis (e.g. if you have an Azure Data Factory pipeline that copies a new Excel file over the previous one), you can move the command code [into a separate component](./companyAnalysisComponent.yml) and use the `is_deterministic: false` attribute in the yaml file. Then you can execute a new job using the updated file:
48+
49+
```dotnetcli
50+
az ml job create -f company_analysis_pipeline_with_component.yml
51+
```
52+
53+
You can publish a job as a pipeline through the UI:
54+
55+
![Publish a pipeline](./Readme.PublishPipeline.png)
56+
57+
## Additional topics addressed in video
58+
59+
You can install and run the linter and file formatter using the following code:
60+
61+
```dotnetcli
62+
pip install black[jupyter] flake8 flake8_nb
63+
black .
64+
flake8 .
65+
flake8_nb .
66+
```
67+
68+
> See the [setup.cfg](./setup.cfg) file for configuration options.
69+
70+
Before using the `az` command for the first time, you will need to login and optionally set the active subscription (if you have more than one). Here are the commands shown in the video:
71+
72+
```dotnetcli
73+
az --version
74+
az login
75+
az account show
76+
az account list
77+
az account set --subscription ab05...ab05
78+
az account show
79+
```
80+
81+
## References
82+
83+
You can read more:
84+
85+
- https://learn.microsoft.com/azure/machine-learning/how-to-create-component-pipelines-cli
86+
- https://learn.microsoft.com/azure/machine-learning/tutorial-pipeline-python-sdk
87+
- https://github.com/Azure/azureml-examples/tree/main/sdk/python/assets/environment
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

# Disable output caching so the analysis re-runs on every pipeline
# submission (e.g. when the input Excel file is overwritten in place).
is_deterministic: false

name: company_analysis
display_name: Analysis of a company

# The script and its sources live in the Code folder.
code: ./Code
command: python analysis.py --company-code ${{inputs.company_code_from_component}} --input-folder ${{inputs.input_folder}} --output-folder ${{outputs.output_folder_from_component}}
# NOTE(review): environment.yml in this folder registers
# 'analysis-environment'; confirm that 'analysis-environment-py38'
# is also registered in the workspace.
environment: azureml:analysis-environment-py38@latest

inputs:
  company_code_from_component:
    type: string
  input_folder:
    type: path
outputs:
  output_folder_from_component:
    type: path
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

display_name: company_analysis
experiment_name: Notebook2Pipeline
compute: azureml:cpu-cluster

# Pipeline-level inputs/outputs, wired into the single job below.
inputs:
  company_code: "CompanyA"
  input_folder:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/Companies/
outputs:
  output_folder:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/Outputs/

jobs:
  # Single command job running analysis.py from the Code folder.
  analysis:
    code: ./Code
    command: python analysis.py --company-code ${{inputs.company_code}} --input-folder ${{inputs.input_folder}} --output-folder ${{outputs.output_folder}}
    environment: azureml:analysis-environment@latest
    inputs:
      company_code: ${{parent.inputs.company_code}}
      input_folder: ${{parent.inputs.input_folder}}
    outputs:
      output_folder: ${{parent.outputs.output_folder}}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

display_name: company_analysis
description: Pipeline to analyze a company
experiment_name: Notebook2Pipeline

compute: azureml:cpu-cluster

inputs:
  company_code: "CompanyA"

outputs:
  output_folder:
    type: uri_folder
    mode: rw_mount
    path: azureml://datastores/workspaceblobstore/paths/Outputs/

jobs:
  # Same analysis as company_analysis_pipeline.yml, but run through the
  # reusable component (which sets is_deterministic: false, so the step
  # re-executes instead of reusing cached outputs).
  analysis:
    type: command
    component: ./companyAnalysisComponent.yml
    inputs:
      company_code_from_component: ${{parent.inputs.company_code}}
      input_folder:
        type: uri_folder
        path: azureml://datastores/workspaceblobstore/paths/Companies/
    outputs:
      output_folder_from_component: ${{parent.outputs.output_folder}}

notebook-to-pipeline/environment.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# AzureML environment built from the local Docker context.
# Register with: az ml environment create -f environment.yml
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: analysis-environment
build:
  path: DockerContext

0 commit comments

Comments
 (0)