Skip to content

Commit 53518be

Browse files
Added notebook to pipeline
1 parent 414d158 commit 53518be

20 files changed

+5792
-0
lines changed

notebook-to-pipeline/.amlignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2+
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3+
4+
.ipynb_aml_checkpoints/
5+
*.amltmp
6+
*.amltemp

notebook-to-pipeline/Code/.amlignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2+
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3+
4+
.ipynb_aml_checkpoints/
5+
*.amltmp
6+
*.amltemp
7+
.amlignore

notebook-to-pipeline/Code/analysis.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
"""Outlier analysis for a single company's Excel records.

Reads <input-folder>/<company-code>/<input-file>, flags outliers with
three pyod models (PCA, LOF, ABOD), and writes the annotated records
plus diagnostic plots (correlation heatmap, pair plot, 3D outlier
scatter) to <output-folder>/<company-code>.

Exported from a notebook; intended to run as an AzureML pipeline step:
    python analysis.py --company-code CompanyA \
        --input-folder ../Companies --output-folder ../analysis_results
"""
import argparse
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
from pyod.models.abod import ABOD
from pyod.models.lof import LOF
from pyod.models.pca import PCA

# Environment setup needed to create the
# DockerContext/requirements.txt file
# %pip install openpyxl
# %pip install pyod
# %pip install -U kaleido
# You can find specific version of packages
# using the following command
# %pip freeze | grep openpyxl


def _str_to_bool(value):
    """Interpret a command-line string as a boolean.

    argparse's ``type=bool`` is a well-known trap: any non-empty string
    (including "False") is truthy, so ``--showgraph False`` would have
    enabled the graphs. Parse the usual spellings explicitly instead.
    """
    return str(value).strip().lower() in ("1", "true", "yes", "y")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--company-code", type=str, dest="company_code", help="Company code. E.g. CompanyA"
)
parser.add_argument(
    "--input-folder",
    type=str,
    dest="input_folder",
    help="The folder that contains the data",
)
parser.add_argument(
    "--output-folder",
    type=str,
    dest="output_folder",
    help="The folder to store results",
)
parser.add_argument(
    "--input-file",
    type=str,
    dest="input_file",
    help="The name of the Excel containing the data",
    default="Records.xlsx",
)
parser.add_argument(
    "--showgraph",
    type=_str_to_bool,
    dest="showgraph",
    help="Display graphs (true/false)",
    default=False,
)

args = parser.parse_args()

excel_file_path = os.path.join(args.input_folder, args.company_code, args.input_file)
print(excel_file_path)

df = pd.read_excel(excel_file_path, engine="openpyxl")
print(df.head())

# The Date column is an identifier, not a feature; keep df intact and
# model on a copy without it.
df_no_date = df.drop(columns="Date", inplace=False)

# Copy some code from https://github.com/yzhao062/pyod/blob/master/notebooks/Compare%20All%20Models.ipynb
models = {
    "pca": PCA(contamination=0.1, n_components=3),
    "lof": LOF(contamination=0.1),
    "abod": ABOD(contamination=0.1),
}

# Fit each detector and record its verdict (1 = outlier) as a new
# column, named after the model, on the original frame.
for i, (clf_name, clf) in enumerate(models.items()):
    print(i + 1, "fitting", clf_name)
    clf.fit(df_no_date)
    outliers = clf.predict(df_no_date)
    df[clf_name] = outliers

outputs_folder = os.path.join(args.output_folder, args.company_code)
# Ensure that output folder exist
Path(outputs_folder).mkdir(parents=True, exist_ok=True)

df.to_excel(os.path.join(outputs_folder, "outlier_records.xlsx"))

# Create feature correlations plot
features = list(df_no_date.columns)

fig = ff.create_annotated_heatmap(
    np.array(df_no_date.corr().round(2)),
    colorscale="Viridis",
    x=features,
    y=features,
    hoverongaps=True,
)

fig.update_layout(
    paper_bgcolor="white",
    width=1200,
    height=1200,
    # NOTE: titlefont is a deprecated alias of title_font in newer plotly;
    # kept as-is for the pinned plotly==5.10.0.
    titlefont=dict(size=25),
    title_text="Features correlation plot",
)
fig.update_xaxes(tickangle=90, side="bottom")

fig.write_image(os.path.join(outputs_folder, "correlation_plot.png"))
plotly.offline.plot(fig, filename=os.path.join(outputs_folder, "correlation_plot.html"))

# Let's print a pairplot
# https://doobzncoobz.com/seaborn-pairplot/
plt.figure()
sns_plot = sns.pairplot(df_no_date)
sns_plot.fig.set_size_inches(15, 15)
sns_plot.fig.suptitle("Pair plot", y=1.01, size=30)

sns_plot.savefig(os.path.join(outputs_folder, "sns_pairplot.png"))

# 3D scatter of three features, split by the first model's verdict.
# NOTE(review): assumes columns ft01/ft02/ft03 exist in the input
# Excel — confirm against the data schema.
outlier_column = list(models.keys())[0]

fig = px.scatter_3d(data_frame=df, x="ft01", y="ft02", z="ft03", symbol=outlier_column)
fig.update_layout(
    margin=dict(l=30, r=30, b=30, t=30),
    autosize=False,
    width=1000,
    height=1000,
    showlegend=False,
    title={
        "text": f"Outlier Plot ({outlier_column})",
        "y": 0.91,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
)

# circle's are outliers, diamonds are normal entries for the specific model
# Let's change the outliers to red X and the rest into green circles
for i, d in enumerate(fig.data):
    if fig.data[i].marker.symbol == "circle":
        fig.data[i].marker.symbol = "x"
        fig.data[i].marker.color = "red"
    else:
        fig.data[i].marker.symbol = "circle"
        fig.data[i].marker.color = "green"

if args.showgraph:
    fig.show()

fig.write_image(os.path.join(outputs_folder, "outlier_plot.png"))
plotly.offline.plot(fig, filename=os.path.join(outputs_folder, "outlier_plot.html"))
Binary file not shown.
Binary file not shown.

notebook-to-pipeline/CompanyA.xlsx

13.1 KB
Binary file not shown.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.amlignore
2+
.ipynb_aml_checkpoints/
3+
*.amltmp
4+
*.amltemp
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Image for the company-analysis AzureML environment.
FROM python:3.8

# Install the pinned Python dependencies for analysis.py.
COPY requirements.txt .
RUN pip install -r requirements.txt

# Default command when the container is run interactively.
CMD ["bash"]
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Pinned dependencies for the analysis environment (python:3.8 base image).
openpyxl==3.0.10
pyod==1.0.4
kaleido==0.2.1
plotly==5.10.0
pandas==1.1.5
numpy==1.21.6
seaborn==0.11.2
matplotlib==3.2.1
# NOTE: 'argparse' and 'pathlib' ship with the Python 3 standard library.
# The PyPI 'argparse' package is an obsolete Python 2 backport shim and
# 'pathlib2' is unused by the code (analysis.py imports stdlib pathlib),
# so both are intentionally omitted.
84.7 KB
Loading
7.32 KB
Loading

notebook-to-pipeline/Readme.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Converting a notebook into a repeatable AzureML pipeline (cli v2)
2+
3+
This folder contains the sample code used in the following video:
4+
5+
[![Notebook to AzureML pipeline](https://img.youtube.com/vi/4QPqbLf57iU/0.jpg)](https://youtu.be/4QPqbLf57iU)
6+
7+
## Curate the code
8+
9+
Let's assume that you start from [a notebook](./original_notebook.ipynb) that does an analysis reading [an Excel file](./CompanyA.xlsx).
10+
The goal is to make this notebook generic, so that it can read the Excel files from a folder structure like the following:
11+
12+
```bash
13+
Companies
14+
CompanyA
15+
Records.xlsx
16+
CompanyB
17+
Records.xlsx
18+
```
19+
20+
After [modifying the notebook code](./final_notebook.ipynb), you can export [a python script](./Code/analysis.py) which you can execute using the following command:
21+
22+
```bash
23+
cd Code
24+
python analysis.py --company-code CompanyA --input-folder ../Companies --output-folder ../analysis_results
25+
```
26+
27+
The outputs of the analysis are stored in the `analysis_results` folder under a folder named `CompanyA`, as seen below:
28+
29+
![Results tree view](./Readme.ResultsTreeView.png)
30+
31+
## Create the pipeline
32+
33+
To create a pipeline, you need to create [an environment](./environment.yml) which contains all the [software dependencies](./DockerContext/requirements.txt) of your code. You register the environment using the following command:
34+
35+
```dotnetcli
36+
az ml environment create -f environment.yml
37+
```
38+
39+
> Note that in the video the file was named with a capital E, e.g. Environment.yml
40+
41+
You can then execute a job using:
42+
43+
```dotnetcli
44+
az ml job create -f company_analysis_pipeline.yml
45+
```
46+
47+
The job above will execute only once and then reuse the cached outputs if you execute it a second time. If you want to force the re-execution of the company analysis (e.g. if you have an Azure Data Factory pipeline that copies a new Excel file over the previous one), you can move the command code [into a separate component](./companyAnalysisComponent.yml) and use the `is_deterministic: false` attribute in the yaml file. Then you can execute a new job using the updated file:
48+
49+
```dotnetcli
50+
az ml job create -f company_analysis_pipeline_with_component.yml
51+
```
52+
53+
You can publish a job as a pipeline through the UI:
54+
55+
![Publish a pipeline](./Readme.PublishPipeline.png)
56+
57+
## Additional topics addressed in video
58+
59+
You can install and run the linter and file formatter using the following code:
60+
61+
```dotnetcli
62+
pip install black[jupyter] flake8 flake8_nb
63+
black .
64+
flake8 .
65+
flake8_nb .
66+
```
67+
68+
> See the [setup.cfg](./setup.cfg) file for configuration options.
69+
70+
Before using the `az` command for the first time, you will need to login and optionally set the active subscription (if you have more than one). Here are the commands shown in the video:
71+
72+
```dotnetcli
73+
az --version
74+
az login
75+
az account show
76+
az account list
77+
az account set --subscription ab05...ab05
78+
az account show
79+
```
80+
81+
## References
82+
83+
You can read more:
84+
85+
- https://learn.microsoft.com/azure/machine-learning/how-to-create-component-pipelines-cli
86+
- https://learn.microsoft.com/azure/machine-learning/tutorial-pipeline-python-sdk
87+
- https://github.com/Azure/azureml-examples/tree/main/sdk/python/assets/environment
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

# Disable output caching so the analysis re-runs on every pipeline
# submission (e.g. when the input Excel file is overwritten in place).
is_deterministic: false

name: company_analysis
display_name: Analysis of a company

# The script and its sources live in the Code folder.
code: ./Code
command: python analysis.py --company-code ${{inputs.company_code_from_component}} --input-folder ${{inputs.input_folder}} --output-folder ${{outputs.output_folder_from_component}}
# NOTE(review): environment.yml in this folder registers
# 'analysis-environment'; confirm that 'analysis-environment-py38'
# is also registered in the workspace.
environment: azureml:analysis-environment-py38@latest

inputs:
  company_code_from_component:
    type: string
  input_folder:
    type: path
outputs:
  output_folder_from_component:
    type: path
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

display_name: company_analysis
experiment_name: Notebook2Pipeline
compute: azureml:cpu-cluster

# Pipeline-level inputs/outputs, wired into the single job below.
inputs:
  company_code: "CompanyA"
  input_folder:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/Companies/
outputs:
  output_folder:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/Outputs/

jobs:
  # Single command job running analysis.py from the Code folder.
  analysis:
    code: ./Code
    command: python analysis.py --company-code ${{inputs.company_code}} --input-folder ${{inputs.input_folder}} --output-folder ${{outputs.output_folder}}
    environment: azureml:analysis-environment@latest
    inputs:
      company_code: ${{parent.inputs.company_code}}
      input_folder: ${{parent.inputs.input_folder}}
    outputs:
      output_folder: ${{parent.outputs.output_folder}}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

display_name: company_analysis
description: Pipeline to analyze a company
experiment_name: Notebook2Pipeline

compute: azureml:cpu-cluster

inputs:
  company_code: "CompanyA"

outputs:
  output_folder:
    type: uri_folder
    mode: rw_mount
    path: azureml://datastores/workspaceblobstore/paths/Outputs/

jobs:
  # Same analysis as company_analysis_pipeline.yml, but run through the
  # reusable component (which sets is_deterministic: false, so the step
  # re-executes instead of reusing cached outputs).
  analysis:
    type: command
    component: ./companyAnalysisComponent.yml
    inputs:
      company_code_from_component: ${{parent.inputs.company_code}}
      input_folder:
        type: uri_folder
        path: azureml://datastores/workspaceblobstore/paths/Companies/
    outputs:
      output_folder_from_component: ${{parent.outputs.output_folder}}

notebook-to-pipeline/environment.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# AzureML environment built from the local Docker context.
# Register with: az ml environment create -f environment.yml
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: analysis-environment
build:
  path: DockerContext

0 commit comments

Comments
 (0)