From c52670884dab428ed79356a9d858f22cb3f41255 Mon Sep 17 00:00:00 2001
From: Kishan Savant <66986430+NeoKish@users.noreply.github.com>
Date: Wed, 17 Aug 2022 06:54:50 +0530
Subject: [PATCH] Example of converting H&M Fashion Prediction Kaggle notebook
to a Kubeflow pipeline (#976)
* Added files via upload
* Modified Kale NB
* Updated the README.md and Kale NB
* Updated README.md
* Added a file size check for downloaded files in KaleNB
* Fixed minor grammatical errors
* Fixed a minor grammatical error
* Updated the download link for house-prices kfp notebook
* Fixed the error in Kale NB
---
h-and-m-fash-rec-kaggle-competition/README.md | 179 +++++
.../h&m-fash-rec-kale.ipynb | 675 ++++++++++++++++++
.../h&m-fash-rec-kfp.ipynb | 614 ++++++++++++++++
.../h&m-fash-rec-orig.ipynb | 439 ++++++++++++
.../images/kaggle_api_token.PNG | Bin 0 -> 13345 bytes
.../images/kale_cell_metadata.PNG | Bin 0 -> 18886 bytes
.../images/kale_deployment_panel.PNG | Bin 0 -> 57775 bytes
.../images/kale_pipeline_graph.PNG | Bin 0 -> 6504 bytes
.../images/kfp_client.PNG | Bin 0 -> 6651 bytes
.../images/kfp_pipeline_func.PNG | Bin 0 -> 25275 bytes
.../images/kfp_pipeline_graph.PNG | Bin 0 -> 6426 bytes
.../requirements.txt | 5 +
.../resource.yaml | 16 +
.../house-prices-kfp.ipynb | 2 +-
14 files changed, 1929 insertions(+), 1 deletion(-)
create mode 100644 h-and-m-fash-rec-kaggle-competition/README.md
create mode 100644 h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kale.ipynb
create mode 100644 h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kfp.ipynb
create mode 100644 h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-orig.ipynb
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kaggle_api_token.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kale_cell_metadata.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kale_deployment_panel.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kale_pipeline_graph.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kfp_client.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kfp_pipeline_func.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/images/kfp_pipeline_graph.PNG
create mode 100644 h-and-m-fash-rec-kaggle-competition/requirements.txt
create mode 100644 h-and-m-fash-rec-kaggle-competition/resource.yaml
diff --git a/h-and-m-fash-rec-kaggle-competition/README.md b/h-and-m-fash-rec-kaggle-competition/README.md
new file mode 100644
index 000000000..82a62d1c7
--- /dev/null
+++ b/h-and-m-fash-rec-kaggle-competition/README.md
@@ -0,0 +1,179 @@
+# Kaggle Featured Prediction Competition: H&M Personalized Fashion Recommendations
+
+In this [competition](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations), the task is to recommend products to customers based on their previous purchases. A wide range of data is available, including customer metadata, product metadata, and everything from simple attributes such as garment type and customer age to text from product descriptions and images of the garments.
+
+In this notebook we build the recommender with the Alternating Least Squares (ALS) implementation from the `implicit` library. Please check out the [docs](https://benfred.github.io/implicit/index.html) for more information.
+
+## Prerequisites for Building the Kubeflow Pipeline
+
+If you don’t already have Kubeflow up and running, we recommend signing up for a free trial of Arrikto's [Kubeflow as a Service](https://www.arrikto.com/kubeflow-as-a-service/). For the following example, we are using Kubeflow as a Service, but you should be able to run this example on any Kubeflow distribution.
+
+## Testing environment
+
+| Name | Version |
+| ------------- |:-------------:|
+| Kubeflow | v1.4 |
+| kfp | 1.8.11 |
+| kubeflow-kale | 0.6.0 |
+
+## Initial Steps
+
+1. Please follow the Prerequisites section to get Kubeflow running.
+2. Create a new Jupyter Notebook server with the following resources:
+ - CPU : 1
+ - RAM : 32GB
+ - Workspace Volume : 50GB
+3. Once you have the Jupyter Notebook server running, connect to it.
+4. Clone this repo from the Terminal, so you have access to this directory.
+5. Before moving on to the Vanilla KFP steps, we need to save our Kaggle API credentials as a Kubernetes secret so that our KFP/Kale pipelines can use the Kaggle Public [API](https://github.com/Kaggle/kaggle-api/blob/master/kaggle/api/kaggle_api_extended.py) to download the competition files. Follow these steps:
+ - If you are not a Kaggle user, first create a Kaggle account. Then go to your Kaggle Account page and scroll down to the API section.
+
+ ![Kaggle API token section](images/kaggle_api_token.PNG)
+
+ - Click on Create New API Token. A new API token in the form of a kaggle.json file will be created, which you can save locally. The kaggle.json file contains your Kaggle username and key.
+ - Once you have the API credentials, run the following command in the terminal, replacing `<username>` and `<key>` with the values from the kaggle.json file you just saved.
+
+ ```
+ kubectl create secret generic kaggle-secret --from-literal=KAGGLE_USERNAME=<username> --from-literal=KAGGLE_KEY=<key>
+
+ ```
+ This creates a secret for our credentials which can then be mounted on our pods.
+
+ - Next, create a YAML file (provided in this directory as `resource.yaml`) with the following contents. It defines a PodDefault resource that mounts the secret into any pod carrying a specific label (in our case `kaggle-secret: "true"`).
+
+ ```
+ apiVersion: "kubeflow.org/v1alpha1"
+ kind: PodDefault
+ metadata:
+   name: kaggle-access
+ spec:
+   selector:
+     matchLabels:
+       kaggle-secret: "true"
+   desc: "kaggle-access"
+   volumeMounts:
+   - name: secret-volume
+     mountPath: /secret/kaggle
+   volumes:
+   - name: secret-volume
+     secret:
+       secretName: kaggle-secret
+ ```
+ - To create the PodDefault resource, apply the file with the following command:
+
+ ```
+ kubectl apply -f resource.yaml
+
+ ```
+ You can check out the following [link](https://support.arrikto.com/hc/en-us/articles/6335158153489-Acessing-External-System-with-User-Credentials-Kaggle-Example-) for more details on accessing an external system with user credentials.
+6. Once step 5 is complete, you are ready to start with the Vanilla KFP steps.
+
+
+
+## Vanilla KFP version
+
+To start building a Kubeflow pipeline, first get acquainted with the Kubeflow Pipelines [documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/build-pipeline/) to understand what a pipeline is, what its components are, and what goes into those components. There are different ways to build a pipeline component, as described [here](https://www.kubeflow.org/docs/components/pipelines/sdk/build-pipeline/#building-pipeline-components). In the following example we use [lightweight Python function](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/)-based components to build the pipeline.
+
+### Step 1: Install the Kubeflow Pipeline SDK and import the required kfp packages to run the pipeline
+
+From kfp we use [func_to_container_op](https://kubeflow-pipelines.readthedocs.io/en/stable/source/kfp.components.html#kfp.components.func_to_container_op), which builds a factory function from a Python function, and [InputPath](https://kubeflow-pipelines.readthedocs.io/en/stable/source/kfp.components.html#kfp.components.InputPath) and [OutputPath](https://kubeflow-pipelines.readthedocs.io/en/stable/source/kfp.components.html#kfp.components.OutputPath) from the components package, which pass the paths of files or models between tasks. This [passing of data](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#pass-data) uses kfp's supported data-passing mechanism: InputPath and OutputPath are how you pass data or models between components. For [passing values](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#passing-parameters-by-value), we use NamedTuples, which let us return multiple values from a component.
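+
+As a minimal sketch (the component below is illustrative and not part of the actual pipeline), a lightweight Python function component that reads an input file, writes an output artifact, and returns a value looks like this:
+
+```
+from typing import NamedTuple
+from kfp.components import func_to_container_op, InputPath, OutputPath
+
+def count_rows(csv_path: InputPath('CSV'),
+               parquet_path: OutputPath('ApacheParquet')) -> NamedTuple('Outputs', [('n_rows', int)]):
+    # Imports live inside the function: it runs in its own container
+    import pandas as pd
+    df = pd.read_csv(csv_path)
+    df.to_parquet(parquet_path)
+    return (len(df),)
+
+# The factory function produces a pipeline task each time it is called inside a pipeline
+count_rows_op = func_to_container_op(count_rows, packages_to_install=['pandas', 'pyarrow'])
+```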
+
+### Step 2: Build out the pipeline components
+
+Our Kubeflow pipeline is broken down into five pipeline components:
+
+- Download the data from Kaggle
+- Load and preprocess the data
+- Create the sparse matrix
+- Train the model
+- Make predictions
+
+We convert each Python function into a factory function using `func_to_container_op`; calling that factory inside the pipeline function then produces the corresponding pipeline task.
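+
+For example, the download step in the accompanying notebook is wrapped as follows:
+
+```
+download_data_op = func_to_container_op(download_kaggle_dataset, packages_to_install = import_packages)
+```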
+
+### Step 3: Creating the pipeline function
+
+After building all the pipeline components, we define a pipeline function that connects them with the appropriate inputs and outputs. Running this function generates the pipeline graph.
+
+Pipeline function:
+
+
+![Pipeline function](images/kfp_pipeline_func.PNG)
+
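+
+An abridged version of the pipeline function from the accompanying notebook: a volume is created and mounted into every step, and the download step is labeled so that the Kaggle secret gets mounted into its pod.
+
+```
+def kfp_pipeline():
+    # Volume shared by all pipeline steps
+    vop = kfp.dsl.VolumeOp(name="create-volume", resource_name="mypvc",
+                           size="10Gi", modes=kfp.dsl.VOLUME_MODE_RWM)
+
+    download_task = download_data_op("/mnt/data/") \
+        .add_pvolumes({"/mnt": vop.volume}) \
+        .add_pod_label("kaggle-secret", "true")
+    load_task = load_and_preprocess_data_op(download_task.output) \
+        .add_pvolumes({"/mnt": vop.volume})
+    # ... the sparse matrix, training and prediction tasks follow the same pattern
+```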
+### Step 4: Running the pipeline using a kfp.Client instance
+
+There are different ways to run the pipeline function, as described in the [documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/build-pipeline/#compile-and-run-your-pipeline). Here we run the pipeline using the Kubeflow Pipelines SDK client.
+
+
+![Running the pipeline with the KFP client](images/kfp_client.PNG)
+
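+
+The corresponding cell from the notebook:
+
+```
+# Using kfp.Client() to run the pipeline from the notebook itself
+client = kfp.Client()
+
+# Running the pipeline
+client.create_run_from_pipeline_func(
+    kfp_pipeline,
+    arguments={})
+```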
+Once all the cells have executed successfully, you should see two hyperlinks, ‘Experiment details’ and ‘Run details’. Click on the ‘Run details’ link to observe the pipeline running.
+
+The final pipeline graph looks as follows:
+
+
+![Vanilla KFP pipeline graph](images/kfp_pipeline_graph.PNG)
+
+## Kale KFP version
+
+For this Kaggle notebook example we are using [Kubeflow as a Service](https://www.arrikto.com/kubeflow-as-a-service/), where Kale comes preinstalled. If you are on a different Kubeflow setup, refer to the [GitHub link](https://github.com/kubeflow-kale/kale#getting-started) for installing the Kale JupyterLab extension.
+
+### Step 1: Install all the required packages
+
+Run the first code cell to install all the required packages (those not available in the standard Python library) using the requirements.txt file. Restart the kernel after installation.
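+
+Judging from the pip output in the notebook, `requirements.txt` lists the following packages:
+
+```
+numpy
+pandas
+implicit
+sklearn
+kaggle
+```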
+
+### Step 2: Download the data from Kaggle
+
+Run the second code cell to download the relevant data from Kaggle using the Kaggle Public API. You will need the API credentials from the kaggle.json file you obtained earlier in the Initial Steps. For the Kale notebook version you don't have to create the secret; the API credentials alone are enough to download the data. Once the cell has run, you should see a new "data" directory containing the downloaded and unzipped files. Please ensure that you run the cell only once so you don't create nested directories, and restart the kernel before running it again.
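+
+Condensed from the notebook, the core of the download cell looks like this (replace the placeholder strings with the credentials from your kaggle.json):
+
+```
+import os
+
+os.environ['KAGGLE_USERNAME'] = "KAGGLE_USERNAME"  # username from kaggle.json
+os.environ['KAGGLE_KEY'] = "KAGGLE_KEY"            # key from kaggle.json
+
+from kaggle.api.kaggle_api_extended import KaggleApi
+api = KaggleApi()
+api.authenticate()
+
+# Download the required files individually (download the full dataset if you also want the images)
+api.competition_download_file('h-and-m-personalized-fashion-recommendations', 'transactions_train.csv')
+# ... customers.csv, articles.csv and sample_submission.csv are downloaded and unzipped the same way
+```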
+
+### Step 3: Annotate the notebook with Kale tags
+
+The Kale notebook in the directory is already annotated. To see the annotations, open up the Kale Deployment panel and click on the Enable switch button. Once you have it switched on, you should see the following:
+
+
+![Kale Deployment panel](images/kale_deployment_panel.PNG)
+
+Please take time to understand how each cell is annotated by clicking on the cell and checking which tag is used and what its dependencies are. Kale provides six tags for annotations:
+
+- Imports
+- Functions
+- Pipeline Parameters
+- Pipeline Metrics
+- Pipeline Step
+- Skip Cell
+
+You can also inspect the tags that were created by opening the Cell Metadata: click on the Property Inspector above the Kale Deployment Panel button.
+
+
+![Kale cell metadata](images/kale_cell_metadata.PNG)
+
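+
+For example, the model-training cell in the Kale notebook carries these tags in its cell metadata:
+
+```
+"tags": [
+    "block:train_model",
+    "prev:sparse_matrix_creation"
+]
+```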
+### Step 4: Run the Kubeflow Pipeline
+
+Once you’ve tagged your notebook, click on the “Compile and Run” button in the Kale widget. Kale will perform the following tasks for you:
+
+- Validate the notebook
+- Take a snapshot
+- Compile the notebook
+- Upload the pipeline
+- Run the pipeline
+
+In the “Running pipeline” output, click on the “View” hyperlink. This will take you directly to the runtime execution graph where you can watch your pipeline execute and update in real-time.
+
+
+![Kale pipeline graph](images/kale_pipeline_graph.PNG)
+
+## Note:
+Both notebooks have been tested, and a full pipeline run takes around 2 hours for both the Vanilla KFP and the Kale KFP versions, with most of that time spent in the predictions stage. In case of any error, please try the following Docker image.
+
+Notebook server docker image used: gcr.io/arrikto/jupyter-kale-py36@sha256:dd3f92ca66b46d247e4b9b6a9d84ffbb368646263c2e3909473c3b851f3fe198
+
+If the error persists, please raise an issue.
diff --git a/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kale.ipynb b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kale.ipynb
new file mode 100644
index 000000000..007261ef6
--- /dev/null
+++ b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kale.ipynb
@@ -0,0 +1,675 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Kaggle Featured Prediction Competition: H&M Personalized Fashion Recommendations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "In this [competition](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations), the task is to recommend products to customers based on their previous purchases. A wide range of data is available, including customer metadata, product metadata, and everything from simple attributes such as garment type and customer age to text from product descriptions and images of the garments.\n",
+ "\n",
+ "In this notebook we will be working with implicit's ALS library for our recommender systems. Please do check out the [docs](https://benfred.github.io/implicit/index.html) for more information."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Install necessary packages\n",
+ "\n",
+    "We can install the necessary packages by either running `pip install --user <package>` for each one or by putting everything in a `requirements.txt` file and running `pip install --user -r requirements.txt`. We have put the dependencies in a `requirements.txt` file, so we will use the latter method.\n",
+ "\n",
+ "Restart the kernel after installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (1.19.5)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (1.1.5)\n",
+ "Collecting implicit\n",
+ " Downloading implicit-0.5.2-cp36-cp36m-manylinux2014_x86_64.whl (18.6 MB)\n",
+ " |████████████████████████████████| 18.6 MB 8.6 MB/s \n",
+ "\u001b[?25hCollecting sklearn\n",
+ " Downloading sklearn-0.0.tar.gz (1.1 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25hCollecting kaggle\n",
+ " Downloading kaggle-1.5.12.tar.gz (58 kB)\n",
+ " |████████████████████████████████| 58 kB 8.3 MB/s \n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25hRequirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->-r requirements.txt (line 2)) (2021.3)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->-r requirements.txt (line 2)) (2.8.2)\n",
+ "Collecting tqdm>=4.27\n",
+ " Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)\n",
+ " |████████████████████████████████| 78 kB 7.2 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: scipy>=0.16 in /usr/local/lib/python3.6/dist-packages (from implicit->-r requirements.txt (line 3)) (1.5.4)\n",
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn->-r requirements.txt (line 4)) (0.23.2)\n",
+ "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/dist-packages (from kaggle->-r requirements.txt (line 5)) (1.16.0)\n",
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kaggle->-r requirements.txt (line 5)) (2021.10.8)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kaggle->-r requirements.txt (line 5)) (2.27.1)\n",
+ "Collecting python-slugify\n",
+ " Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)\n",
+ "Requirement already satisfied: urllib3 in /usr/local/lib/python3.6/dist-packages (from kaggle->-r requirements.txt (line 5)) (1.26.8)\n",
+ "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.6/dist-packages (from tqdm>=4.27->implicit->-r requirements.txt (line 3)) (5.4.0)\n",
+ "Collecting text-unidecode>=1.3\n",
+ " Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)\n",
+ " |████████████████████████████████| 78 kB 10.0 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle->-r requirements.txt (line 5)) (2.0.10)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle->-r requirements.txt (line 5)) (3.3)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn->-r requirements.txt (line 4)) (3.0.0)\n",
+ "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sklearn->-r requirements.txt (line 4)) (1.1.0)\n",
+ "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.6/dist-packages (from importlib-resources->tqdm>=4.27->implicit->-r requirements.txt (line 3)) (3.6.0)\n",
+ "Building wheels for collected packages: sklearn, kaggle\n",
+ " Building wheel for sklearn (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=2381 sha256=b097d9ed6edd1c879d42180dffc20aa2659b2628f9711ae4204ce4b1cb636dc2\n",
+ " Stored in directory: /home/jovyan/.cache/pip/wheels/23/9d/42/5ec745cbbb17517000a53cecc49d6a865450d1f5cb16dc8a9c\n",
+ " Building wheel for kaggle (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73994 sha256=6e228faf71a3c29b864d64cfe8fe5b59a1257cfc4afb24f7c4712943489a5710\n",
+ " Stored in directory: /home/jovyan/.cache/pip/wheels/77/47/e4/44a4ba1b7dfd53faaa35f59f1175e123b213ff401a8a56876b\n",
+ "Successfully built sklearn kaggle\n",
+ "Installing collected packages: text-unidecode, tqdm, python-slugify, sklearn, kaggle, implicit\n",
+ "Successfully installed implicit-0.5.2 kaggle-1.5.12 python-slugify-6.1.2 sklearn-0.0 text-unidecode-1.3 tqdm-4.64.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install --user -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Download Data from Kaggle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "Download the relevant data from Kaggle by running the code cell below. Follow the Initial Steps in the GitHub README.md to get the Kaggle username and key used to authenticate with the Kaggle Public API; the credentials are in the kaggle.json file. There is no need to create a secret for this step. This cell needs to be run before starting the Kale pipeline from the Kale Deployment panel. Please ensure that you run the cell only once so you don't create nested directories, and restart the kernel before running it again."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "customers.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
+ "transactions_train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
+ "articles.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
+ "sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
+ "Unzipping the files ...\n",
+ "\n",
+ "\n",
+ "Checking the files are extracted properly ...\n",
+ "\n",
+ "\n",
+ "sample_submission.csv 258MB 258MB\n",
+ "customers.csv 198MB 198MB\n",
+ "transactions_train.csv 3GB 3GB\n",
+ "articles.csv 34MB 34MB\n",
+ "All files are downloaded and unzipped inside the data directory. Please move on to next step\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "\n",
+ "\n",
+ "# Get the Kaggle Username and password from the kaggle.json file\n",
+ "# and paste it in place of KAGGLE_USERNAME AND KAGGLE_KEY on right hand side\n",
+ "\n",
+ "os.environ['KAGGLE_USERNAME'] = \"KAGGLE_USERNAME\"\n",
+ "os.environ['KAGGLE_KEY'] = \"KAGGLE_KEY\"\n",
+ "\n",
+ "path = \"data/\"\n",
+ "\n",
+ "os.chdir(os.getcwd())\n",
+ "os.system(\"mkdir \" + path)\n",
+ "os.chdir(path)\n",
+ "\n",
+ "import kaggle\n",
+ "from kaggle.api.kaggle_api_extended import KaggleApi\n",
+ "api = KaggleApi()\n",
+ "api.authenticate()\n",
+ "\n",
+ "# Getting the files list from Kaggle using Kaggle api\n",
+ "file_list = api.competition_list_files('h-and-m-personalized-fashion-recommendations')\n",
+ "\n",
+    "# Download the required files individually. You can also choose to download the entire dataset if you want to work with image data as well. The files are downloaded as zip archives.\n",
+ "api.competition_download_file('h-and-m-personalized-fashion-recommendations','customers.csv')\n",
+ "api.competition_download_file('h-and-m-personalized-fashion-recommendations','transactions_train.csv')\n",
+ "api.competition_download_file('h-and-m-personalized-fashion-recommendations','articles.csv')\n",
+ "api.competition_download_file('h-and-m-personalized-fashion-recommendations','sample_submission.csv') \n",
+ "\n",
+ "print(\"Unzipping the files ...\")\n",
+ "\n",
+ "# Get the path of the directory where the files are downloaded\n",
+ "path_dir = os.getcwd()\n",
+ "\n",
+ "from zipfile import ZipFile \n",
+ "\n",
+ "# Extracting all files from individual zip files\n",
+ "zipfile1 = ZipFile(path_dir + '/customers.csv.zip', 'r')\n",
+ "zipfile1.extract(\"customers.csv\")\n",
+ "zipfile1.close()\n",
+ " \n",
+ "zipfile2 = ZipFile(path_dir + '/transactions_train.csv.zip', 'r')\n",
+ "zipfile2.extract(\"transactions_train.csv\")\n",
+ "zipfile2.close()\n",
+ " \n",
+ "zipfile3 = ZipFile(path_dir + '/articles.csv.zip', 'r')\n",
+ "zipfile3.extract(\"articles.csv\")\n",
+ "zipfile3.close()\n",
+ " \n",
+ "zipfile4 = ZipFile(path_dir + '/sample_submission.csv.zip', 'r')\n",
+ "zipfile4.extract(\"sample_submission.csv\")\n",
+ "zipfile4.close()\n",
+ "\n",
+ "print(\"Checking the files are extracted properly ...\")\n",
+ "\n",
+ "for file in os.listdir(path_dir):\n",
+ " filename = os.fsdecode(file)\n",
+ " if filename.endswith(\".csv\"):\n",
+ " file_size = os.path.getsize(path_dir + \"/\" + filename)\n",
+ " if file_size< 1e9:\n",
+ " file_size = str(round(file_size/(1024*1024))) + \"MB\"\n",
+ " else:\n",
+ " file_size = str(round(file_size/(1024*1024*1024))) + \"GB\"\n",
+ " for file in file_list:\n",
+ " if file.name == filename and file.size == file_size:\n",
+ " print(file.name,file.size, file_size)\n",
+ "\n",
+ "print(\"All files are downloaded and unzipped inside the data directory. Please move on to next step\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "imports"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import implicit \n",
+ "import scipy.sparse as sparse"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "block:load_and_preprocess_data"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "path = \"data/\"\n",
+ "train_data_filepath = path + \"transactions_train.csv\"\n",
+ "article_metadata_filepath = path + \"articles.csv\"\n",
+ "customer_metadata_filepath = path + \"customers.csv\"\n",
+ "test_data_filepath = path + \"sample_submission.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "train_data = pd.read_csv(train_data_filepath)\n",
+ "test_data = pd.read_csv(test_data_filepath)\n",
+ "customer_data = pd.read_csv(customer_metadata_filepath)\n",
+ "article_data = pd.read_csv(article_metadata_filepath)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Exploring the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "train_data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "train_data.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "test_data.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "customer_data.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "article_data.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+    "# We drop t_dat, sales_channel_id and price as they won't be part of the recommendation system we are building\n",
+ "train_data.drop(['t_dat','sales_channel_id','price'], axis= 1, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Preprocess Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "block:"
+ ]
+ },
+ "outputs": [],
+ "source": [
+    "# Create a new purchase_count column that gives us the count of every article bought by each customer\n",
+ "X = train_data.groupby(['customer_id', 'article_id'])['article_id'].count().reset_index(name = \"purchase_count\") \n",
+ "\n",
+ "# Getting unique number of customers and articles using the customer and article metadata data files\n",
+ "unique_customers = customer_data['customer_id'].unique()\n",
+ "unique_articles = article_data['article_id'].unique()\n",
+ "\n",
+ "# length of the customers and articles\n",
+ "n_customers = len(unique_customers)\n",
+ "n_articles = len(unique_articles)\n",
+ "\n",
+ "# Create a mapping for customer_id to convert it from an object column to an int column for the sparse matrix creation\n",
+ "customer_id_dict = {unique_customers[i]:i for i in range(len(unique_customers))}\n",
+ "reverse_customer_id_dict = {i:unique_customers[i] for i in range(len(unique_customers))} \n",
+ "numeric_cus_id = []\n",
+ "for i in range(len(X['customer_id'])):\n",
+ " numeric_cus_id.append(customer_id_dict.get(X['customer_id'][i]))\n",
+ "X['customer_id'] = numeric_cus_id\n",
+ "\n",
+    "# Create a mapping for article_id so that the sparse matrix doesn't blow up in size due to the long int values of the article_ids\n",
+ "article_id_dict = {unique_articles[i]:i for i in range(len(unique_articles))}\n",
+ "reverse_article_id_dict = {i:unique_articles[i] for i in range(len(unique_articles))}\n",
+ "numeric_art_id = []\n",
+ "for i in range(len(X['article_id'])):\n",
+ " numeric_art_id.append(article_id_dict.get(X['article_id'][i]))\n",
+ "X['article_id'] = numeric_art_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Sparse Matrix Creation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "block:sparse_matrix_creation",
+ "prev:load_and_preprocess_data"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Constructing sparse matrices for alternating least squares algorithm \n",
+ "sparse_user_item_coo = sparse.coo_matrix((X.purchase_count, (X.customer_id, X.article_id)), shape = (n_customers, n_articles))\n",
+ "sparse_user_item_csr = sparse.csr_matrix((X['purchase_count'], (X['customer_id'], X['article_id'])), shape = (n_customers, n_articles))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "sparse_user_item_csr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Model Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "block:train_model",
+ "prev:sparse_matrix_creation"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# parameters for the model\n",
+ "als_params = dict(\n",
+ " factors = 200, # number of latent factors - try between 50 to 1000\n",
+ " regularization = 0.01, # regularization factor - try between 0.001 to 0.2\n",
+ " iterations = 5, # iterations - try between 2 to 100\n",
+ ")\n",
+ "\n",
+ "# initialize a model\n",
+ "model = implicit.als.AlternatingLeastSquares(**als_params)\n",
+ "\n",
+ "# train the model on a sparse matrix of user/item/confidence weights \n",
+ "model.fit(sparse_user_item_csr)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "block:predictions",
+ "prev:sparse_matrix_creation",
+ "prev:train_model",
+ "prev:load_and_preprocess_data"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "predictions=[]\n",
+ "count = 0\n",
+ "for cust_id in test_data.customer_id:\n",
+ " cust_id = customer_id_dict.get(cust_id)\n",
+ " if(cust_id!=None): \n",
+ " recommendations = model.recommend(cust_id, sparse_user_item_csr[cust_id],10)\n",
+ " result=[]\n",
+ " for i in range(len(recommendations[0])):\n",
+ " val = reverse_article_id_dict.get(recommendations[0][i])\n",
+ " result.append(val) \n",
+ " predictions.append(result)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_data['prediction'] = predictions\n",
+ "test_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Final Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "skip"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "test_data.to_csv('data/submission.csv', index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "kubeflow_notebook": {
+ "autosnapshot": true,
+ "docker_image": "gcr.io/arrikto/jupyter-kale-py36@sha256:dd3f92ca66b46d247e4b9b6a9d84ffbb368646263c2e3909473c3b851f3fe198",
+ "experiment": {
+ "id": "new",
+ "name": "hm-fash-recomm"
+ },
+ "experiment_name": "hm-fash-recomm",
+ "katib_metadata": {
+ "algorithm": {
+ "algorithmName": "grid"
+ },
+ "maxFailedTrialCount": 3,
+ "maxTrialCount": 12,
+ "objective": {
+ "objectiveMetricName": "",
+ "type": "minimize"
+ },
+ "parallelTrialCount": 3,
+ "parameters": []
+ },
+ "katib_run": false,
+ "pipeline_description": "",
+ "pipeline_name": "predict-hm-purchases-kale-1",
+ "snapshot_volumes": true,
+ "steps_defaults": [
+ "label:access-ml-pipeline:true",
+ "label:access-rok:true"
+ ],
+ "volume_access_mode": "rwm",
+ "volumes": [
+ {
+ "annotations": [],
+ "mount_point": "/home/jovyan",
+ "name": "hm-test2-workspace-6vjtz",
+ "size": 50,
+ "size_type": "Gi",
+ "snapshot": false,
+ "type": "clone"
+ }
+ ]
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kfp.ipynb b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kfp.ipynb
new file mode 100644
index 000000000..7c9c9fdcf
--- /dev/null
+++ b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-kfp.ipynb
@@ -0,0 +1,614 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Kaggle Featured Prediction Competition: H&M Personalized Fashion Recommendations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "In this [competition](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations), the task is to recommend products to customers based on their previous purchases. A wide range of data is available, including customer metadata, product metadata, and everything from simple attributes such as garment type and customer age to text from product descriptions and images of the garments.\n",
+ "\n",
+ "In this notebook we will be working with implicit's ALS library for our recommender systems. Please do check out the [docs](https://benfred.github.io/implicit/index.html) for more information"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "Install the kfp package by uncommenting the line below and restarting the kernel. Comment it out again once the kernel has restarted."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install the kfp \n",
+ "# !pip install kfp --upgrade "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "The following imports are required to build the Kubeflow pipeline and to pass data between its components."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import kfp\n",
+ "from kfp.components import func_to_container_op\n",
+ "import kfp.components as comp\n",
+ "from typing import NamedTuple"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "All the packages required inside a pipeline component are put together in a list, which is then passed to each pipeline component. This might not be efficient when you are dealing with a lot of packages, so for components with many packages and dependencies you can instead build a Docker image and pass that to each pipeline component."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import_packages = ['pandas', 'sklearn', 'implicit', 'kaggle', 'numpy', 'pyarrow']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "In the following implementation of the Kubeflow pipeline we make use of [lightweight Python function components](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/) to build up the pipeline. Data is passed between component instances (tasks) using InputPath and OutputPath. Different ways of storing and passing data between the components are explored in this notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "The pipeline is divided into five components:\n",
+    "\n",
+    " 1. Download the data from Kaggle\n",
+    " 2. Load and preprocess the data\n",
+    " 3. Create the sparse matrix\n",
+    " 4. Train the model\n",
+    " 5. Make predictions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Download the data from Kaggle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "Follow the Initial Steps in the GitHub README.md to create a secret for our credentials and mount it into our pod using a PodDefault resource. Once you have the secret mounted, you can use it to access the username and key needed to download the files from Kaggle. For this competition we download only the required files instead of the whole dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def download_kaggle_dataset(path:str)->str:\n",
+ " \n",
+ " import os\n",
+ "\n",
+ " # Retrieve the credentials from the secret mounted and \n",
+ " # bring it onto our working environment\n",
+ " with open('/secret/kaggle/KAGGLE_KEY', 'r') as file:\n",
+ " kaggle_key = file.read().rstrip()\n",
+ " with open('/secret/kaggle/KAGGLE_USERNAME', 'r') as file:\n",
+ " kaggle_user = file.read().rstrip()\n",
+ " os.environ['KAGGLE_USERNAME'] = kaggle_user \n",
+ " os.environ['KAGGLE_KEY'] = kaggle_key\n",
+ "\n",
+ " os.chdir(os.getcwd())\n",
+ " os.system(\"mkdir \" + path)\n",
+ " os.chdir(path)\n",
+ " \n",
+ " # Using Kaggle Public API to download the datasets\n",
+ " import kaggle \n",
+ " from kaggle.api.kaggle_api_extended import KaggleApi\n",
+ " \n",
+ " api = KaggleApi()\n",
+ " api.authenticate()\n",
+ " \n",
+ " # Download the required files individually. You can also choose to download the entire dataset if you want to work with images as well. \n",
+ " api.competition_download_file('h-and-m-personalized-fashion-recommendations','customers.csv')\n",
+ " api.competition_download_file('h-and-m-personalized-fashion-recommendations','transactions_train.csv')\n",
+ " api.competition_download_file('h-and-m-personalized-fashion-recommendations','articles.csv')\n",
+ " api.competition_download_file('h-and-m-personalized-fashion-recommendations','sample_submission.csv') \n",
+ " \n",
+ " return path \n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "download_data_op = func_to_container_op(download_kaggle_dataset, packages_to_install = import_packages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Load and Preprocess the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_and_preprocess_data(path:str, preprocess_data_path: comp.OutputPath('ApacheParquet'))->NamedTuple('Outputs', [('data_path',str),('int_list', list)]):\n",
+ " \n",
+ " \n",
+ " import pandas as pd\n",
+ " import os\n",
+ " from zipfile import ZipFile \n",
+ " from pyarrow import parquet\n",
+ " import pyarrow as pa\n",
+ " \n",
+ " # Moving to current working directory and creating a new directory\n",
+ " os.chdir(os.getcwd())\n",
+ " print(os.listdir(path))\n",
+ " os.chdir(path)\n",
+ " \n",
+ " # Extracting all files from individual zip files\n",
+ " zipfile1 = ZipFile('customers.csv.zip', 'r')\n",
+ " zipfile1.extract(\"customers.csv\")\n",
+ " zipfile1.close()\n",
+ " \n",
+ " zipfile2 = ZipFile('transactions_train.csv.zip', 'r')\n",
+ " zipfile2.extract(\"transactions_train.csv\")\n",
+ " zipfile2.close()\n",
+ " \n",
+ " zipfile3 = ZipFile('articles.csv.zip', 'r')\n",
+ " zipfile3.extract(\"articles.csv\")\n",
+ " zipfile3.close()\n",
+ " \n",
+ " zipfile4 = ZipFile('sample_submission.csv.zip', 'r')\n",
+ " zipfile4.extract(\"sample_submission.csv\")\n",
+ " zipfile4.close()\n",
+ " \n",
+ " # Converting to pandas dataframe \n",
+ " customer_data = pd.read_csv(\"customers.csv\")\n",
+ " article_data = pd.read_csv(\"articles.csv\")\n",
+ " train_data = pd.read_csv(\"transactions_train.csv\") \n",
+ " \n",
+    "    # Create a new purchase_count column that gives us the count of every article bought by each customer\n",
+ " X = train_data.groupby(['customer_id', 'article_id'])['article_id'].count().reset_index(name = \"purchase_count\") \n",
+ "\n",
+ " # Getting unique number of customers and articles using the customer and article metadata data files\n",
+ " unique_customers = customer_data['customer_id'].unique()\n",
+ " unique_articles = article_data['article_id'].unique()\n",
+ " \n",
+ " # length of the customers and articles\n",
+ " n_customers = len(unique_customers)\n",
+ " n_articles = len(unique_articles)\n",
+ "\n",
+ " # Create a mapping for customer_id to convert it from an object column to an int column for the sparse matrix creation\n",
+ " customer_id_dict = {unique_customers[i]:i for i in range(len(unique_customers))}\n",
+ " reverse_customer_id_dict = {i:unique_customers[i] for i in range(len(unique_customers))} \n",
+ " numeric_cus_id = []\n",
+ " for i in range(len(X['customer_id'])):\n",
+ " numeric_cus_id.append(customer_id_dict.get(X['customer_id'][i]))\n",
+ " X['customer_id'] = numeric_cus_id\n",
+ "\n",
+    "    # Create a mapping for article_id so that the sparse matrix doesn't blow up in size due to the long int values of the article_ids\n",
+ " article_id_dict = {unique_articles[i]:i for i in range(len(unique_articles))}\n",
+ " rev_art_id_dict = {i:int(unique_articles[i]) for i in range(len(unique_articles))}\n",
+ " numeric_art_id = []\n",
+ " for i in range(len(X['article_id'])):\n",
+ " numeric_art_id.append(article_id_dict.get(X['article_id'][i]))\n",
+ " X['article_id'] = numeric_art_id\n",
+ " \n",
+ " # Convert from pandas to Arrow\n",
+ " table = pa.Table.from_pandas(X)\n",
+ " parquet.write_table(table, preprocess_data_path)\n",
+ " \n",
+ " values=[n_customers, n_articles]\n",
+ " \n",
+ " return (path, values)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "load_and_preprocess_data_op = func_to_container_op(load_and_preprocess_data,packages_to_install = import_packages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Creating sparse matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def sparse_matrix_creation(data_path:str, list_val: list, file_path: comp.InputPath('ApacheParquet'), sparse_path: comp.OutputPath())->str:\n",
+ " \n",
+ " import pandas as pd\n",
+ " from pyarrow import parquet\n",
+ " import pyarrow as pa\n",
+ " import scipy.sparse as sparse\n",
+ " from scipy.sparse import coo_matrix\n",
+ " from pathlib import Path\n",
+ " import pickle\n",
+ " \n",
+ " X = parquet.read_pandas(file_path).to_pandas()\n",
+ " \n",
+ " n_customers = list_val[0]\n",
+ " n_articles = list_val[1]\n",
+ "\n",
+ " # Constructing sparse matrices for alternating least squares algorithm \n",
+ " sparse_user_item_coo = sparse.coo_matrix((X.purchase_count, (X.customer_id, X.article_id)), shape = (n_customers, n_articles))\n",
+ " sparse_user_item_csr = sparse.csr_matrix((X['purchase_count'], (X['customer_id'], X['article_id'])), shape = (n_customers, n_articles))\n",
+ "\n",
+ " pickle.dump(sparse_user_item_csr, open(sparse_path, 'wb'))\n",
+ " \n",
+ " return data_path "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sparse_matrix_creation_op = func_to_container_op(sparse_matrix_creation, packages_to_install = import_packages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Train the Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train_model(path:str, sparse_matrix_path: comp.InputPath(), model_path: comp.OutputPath())->str:\n",
+ " \n",
+ " import implicit\n",
+ " import pandas as pd\n",
+ " from pyarrow import parquet\n",
+ " import pyarrow as pa\n",
+ " import scipy.sparse as sparse\n",
+ " import pickle\n",
+ " \n",
+ " # Loading the sparse user item matrix from pickle\n",
+ " sparse_user_item_csr = pickle.load(open(sparse_matrix_path, 'rb'))\n",
+ " \n",
+ " # parameters for the model\n",
+ " als_params = dict(\n",
+ " factors = 200, # number of latent factors - try between 50 to 1000\n",
+ " regularization = 0.01, # regularization factor - try between 0.001 to 0.2\n",
+ " iterations = 5, # iterations - try between 2 to 100\n",
+ " )\n",
+ "\n",
+ " # initialize a model\n",
+ " model = implicit.als.AlternatingLeastSquares(**als_params)\n",
+ "\n",
+ " # train the model on a sparse matrix of user/item/confidence weights \n",
+ " model.fit(sparse_user_item_csr)\n",
+ " \n",
+ " pickle.dump(model, open(model_path, 'wb'))\n",
+ " \n",
+ " return path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_model_op = func_to_container_op(train_model, packages_to_install = import_packages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predictions(test_path:str, model_path : comp.InputPath(), sparse_path: comp.InputPath()):\n",
+ " \n",
+ " import pandas as pd\n",
+ " import os\n",
+ " from zipfile import ZipFile \n",
+ " import pickle\n",
+ " from pyarrow import parquet\n",
+ " import pyarrow as pa\n",
+ " import scipy.sparse as sparse\n",
+ " \n",
+ " sparse_user_item_csr = pickle.load(open(sparse_path, 'rb'))\n",
+ " model = pickle.load(open(model_path, 'rb'))\n",
+ " \n",
+ " os.chdir(os.getcwd())\n",
+ " print(os.listdir(test_path))\n",
+ " os.chdir(test_path)\n",
+ "\n",
+ " # Converting to pandas dataframe \n",
+ " customer_data = pd.read_csv(\"customers.csv\")\n",
+ " article_data = pd.read_csv(\"articles.csv\") \n",
+ " test_data = pd.read_csv(\"sample_submission.csv\")\n",
+ " \n",
+ " # Getting unique number of customers and articles using the customer and article metadata data files\n",
+ " unique_customers = customer_data['customer_id'].unique()\n",
+ " unique_articles = article_data['article_id'].unique()\n",
+ " \n",
+ " # length of the customers and articles\n",
+ " n_customers = len(unique_customers)\n",
+ " n_articles = len(unique_articles)\n",
+ " \n",
+ " # Create a mapping for customer_id\n",
+ " customer_id_dict = {unique_customers[i]:i for i in range(len(unique_customers))}\n",
+ "\n",
+ " # Create a reverse mapping for article_id\n",
+ " reverse_article_id_dict = {i:int(unique_articles[i]) for i in range(len(unique_articles))}\n",
+ "\n",
+ " predictions=[]\n",
+ " count = 0\n",
+ " for cust_id in test_data.customer_id:\n",
+ " cust_id = customer_id_dict.get(cust_id)\n",
+ " if(cust_id!=None): \n",
+ " recommendations = model.recommend(cust_id, sparse_user_item_csr[cust_id],10)\n",
+ " result=[]\n",
+ " for i in range(len(recommendations[0])):\n",
+ " val = reverse_article_id_dict.get(recommendations[0][i])\n",
+ " result.append(val) \n",
+ " predictions.append(result)\n",
+ " \n",
+ " test_data['prediction'] = predictions\n",
+ " test_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prediction_op = func_to_container_op(predictions, packages_to_install = import_packages)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Defining function that implements the pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def kfp_pipeline():\n",
+ " \n",
+ " vop = kfp.dsl.VolumeOp(\n",
+ " name=\"create-volume\", \n",
+ " resource_name=\"mypvc\",\n",
+ " size=\"10Gi\",\n",
+ " modes = kfp.dsl.VOLUME_MODE_RWM\n",
+ " )\n",
+ " \n",
+ " download_task = download_data_op(\"/mnt/data/\").add_pvolumes({\"/mnt\": vop.volume}).add_pod_label(\"kaggle-secret\", \"true\")\n",
+ " load_and_preprocess_data_task = load_and_preprocess_data_op(download_task.output).add_pvolumes({\"/mnt\": vop.volume})\n",
+ " sparse_matrix_task = sparse_matrix_creation_op(data_path =load_and_preprocess_data_task.outputs['data_path'], \n",
+ " file = load_and_preprocess_data_task.outputs['preprocess_data'], \n",
+ " list_val = load_and_preprocess_data_task.outputs['int_list']).add_pvolumes({\"/mnt\": vop.volume})\n",
+ " train_model_task = train_model_op(path = sparse_matrix_task.outputs['Output'], \n",
+ " sparse_matrix = sparse_matrix_task.outputs['sparse']).add_pvolumes({\"/mnt\": vop.volume})\n",
+ " prediction_task = prediction_op(test_path = train_model_task.outputs['Output'],\n",
+ " model = train_model_task.outputs['model'], \n",
+ " sparse = sparse_matrix_task.outputs['sparse']).add_pvolumes({\"/mnt\": vop.volume})\n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Experiment details."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Run details."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "RunPipelineResult(run_id=61cfeaf5-5caa-49f2-89c1-c3f3e9e8d5b5)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Using kfp.Client() to run the pipeline from notebook itself\n",
+ "client = kfp.Client() # change arguments accordingly\n",
+ "\n",
+ "# Running the pipeline\n",
+ "client.create_run_from_pipeline_func(\n",
+ " kfp_pipeline,\n",
+ " arguments={\n",
+ " })"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "kubeflow_notebook": {
+ "autosnapshot": true,
+ "docker_image": "gcr.io/arrikto/jupyter-kale-py36@sha256:dd3f92ca66b46d247e4b9b6a9d84ffbb368646263c2e3909473c3b851f3fe198",
+ "experiment": {
+ "id": "",
+ "name": ""
+ },
+ "experiment_name": "",
+ "katib_metadata": {
+ "algorithm": {
+ "algorithmName": "grid"
+ },
+ "maxFailedTrialCount": 3,
+ "maxTrialCount": 12,
+ "objective": {
+ "objectiveMetricName": "",
+ "type": "minimize"
+ },
+ "parallelTrialCount": 3,
+ "parameters": []
+ },
+ "katib_run": false,
+ "pipeline_description": "",
+ "pipeline_name": "",
+ "snapshot_volumes": true,
+ "steps_defaults": [
+ "label:access-ml-pipeline:true",
+ "label:access-rok:true"
+ ],
+ "volume_access_mode": "rwm",
+ "volumes": [
+ {
+ "annotations": [],
+ "mount_point": "/home/jovyan",
+ "name": "hm-fash-workspace-fhh9d",
+ "size": 50,
+ "size_type": "Gi",
+ "snapshot": false,
+ "type": "clone"
+ }
+ ]
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-orig.ipynb b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-orig.ipynb
new file mode 100644
index 000000000..b06da44ce
--- /dev/null
+++ b/h-and-m-fash-rec-kaggle-competition/h&m-fash-rec-orig.ipynb
@@ -0,0 +1,439 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Kaggle Featured Prediction Competition: H&M Personalized Fashion Recommendations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+    "In this [competition](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations), the task is to recommend products to customers based on their previous purchases. A wide range of data is available, including customer metadata, product metadata, and everything from simple attributes such as garment type and customer age to text from product descriptions and images of the garments."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Install necessary packages\n",
+ "\n",
+    "We can install the necessary packages by either running `pip install --user <package>` for each one or by putting everything in a `requirements.txt` file and running `pip install --user -r requirements.txt`. We have put the dependencies in a `requirements.txt` file, so we will use the latter method.\n",
+ "\n",
+ "Restart the kernel after installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install --user -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np # linear algebra\n",
+ "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+ "import implicit"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = \"data/\"\n",
+ "train_data_filepath = path + \"transactions_train.csv\"\n",
+ "article_metadata_filepath = path + \"articles.csv\"\n",
+ "customer_metadata_filepath = path + \"customers.csv\"\n",
+ "test_data_filepath = path + \"sample_submission.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data = pd.read_csv(train_data_filepath,index_col='customer_id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.drop(['t_dat','sales_channel_id','price'],axis= 1, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data=train_data.sort_values(by=['customer_id']).reset_index()\n",
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Unique customers\",train_data['customer_id'].nunique())\n",
+ "print(\"Unique articles\",train_data['article_id'].nunique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = train_data.groupby(['customer_id', 'article_id'])['article_id'].count().reset_index(name = \"purchase_count\") "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unique_customers = X['customer_id'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unique_articles = X['article_id'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customer_id_dict = {unique_customers[i]:i for i in range(len(unique_customers))}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reverse_customer_id_dict = {i:unique_customers[i] for i in range(len(unique_customers))} "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "numeric_cus_id = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for i in range(len(X['customer_id'])):\n",
+ " numeric_cus_id.append(customer_id_dict.get(X['customer_id'][i]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(X['customer_id'].nunique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(numeric_cus_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X['customer_id'] = numeric_cus_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# forward and inverse mappings for article ids, mirroring the customer id mappings\n",
+ "article_id_dict = {unique_articles[i]: i for i in range(len(unique_articles))}\n",
+ "reverse_article_id_dict = {i: unique_articles[i] for i in range(len(unique_articles))}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "numeric_art_id = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# translate each raw article id in X into its integer index\n",
+ "for i in range(len(X['article_id'])):\n",
+ "    numeric_art_id.append(article_id_dict.get(X['article_id'][i]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X['article_id'] = numeric_art_id\n",
+ "X.head()"
+ ]
+ },
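+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The two id-translation loops above can also be written as vectorized `map` calls. This is a sketch of an equivalent alternative; it should replace the loops rather than run after them, since `X` already holds integer indices at this point:\n",
+ "\n",
+ "```python\n",
+ "# Equivalent, vectorized alternative to the explicit loops above\n",
+ "X['customer_id'] = X['customer_id'].map(customer_id_dict)\n",
+ "X['article_id'] = X['article_id'].map(article_id_dict)\n",
+ "```"
+ ]
+ },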
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Construct sparse user-item matrices for the alternating least squares algorithm\n",
+ "import scipy.sparse as sparse\n",
+ "\n",
+ "n_customers = len(unique_customers)\n",
+ "n_articles = len(unique_articles)\n",
+ "\n",
+ "sparse_user_item_coo = sparse.coo_matrix((X.purchase_count, (X.customer_id, X.article_id)), shape=(n_customers, n_articles))\n",
+ "sparse_user_item_csr = sparse.csr_matrix((X['purchase_count'], (X['customer_id'], X['article_id'])), shape=(n_customers, n_articles))"
+ ]
+ },
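+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The COO matrix is convenient to build from coordinate triples, while the CSR matrix is the format passed to `implicit` below. An optional check of the matrix shape and density (using the `n_customers`/`n_articles` counts defined above):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sanity check: the user-item matrix should be extremely sparse\n",
+ "density = sparse_user_item_csr.nnz / (n_customers * n_articles)\n",
+ "print(\"shape:\", sparse_user_item_csr.shape, \"non-zeros:\", sparse_user_item_csr.nnz, \"density:\", density)"
+ ]
+ },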
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# parameters for the model\n",
+ "als_params = dict(\n",
+ "    factors = 200,          # number of latent factors - try values between 50 and 1000\n",
+ "    regularization = 0.01,  # regularization factor - try values between 0.001 and 0.2\n",
+ "    iterations = 5,         # number of ALS iterations - try values between 2 and 100\n",
+ ")\n",
+ "\n",
+ "# initialize the model\n",
+ "model = implicit.als.AlternatingLeastSquares(**als_params)\n",
+ "\n",
+ "# train the model on a sparse matrix of user/item/confidence weights\n",
+ "model.fit(sparse_user_item_csr)"
+ ]
+ },
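+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once training finishes, a quick spot check (optional, assuming the same `implicit` 0.5+ `recommend` API used below) is to ask for recommendations for a single known customer index and translate the returned indices back to article ids:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional spot check: top-10 articles for the first customer index\n",
+ "ids, scores = model.recommend(0, sparse_user_item_csr[0], 10)\n",
+ "print([reverse_article_id_dict.get(i) for i in ids])"
+ ]
+ },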
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_data = pd.read_csv(test_data_filepath)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = []\n",
+ "for cust_id in test_data.customer_id:\n",
+ "    idx = customer_id_dict.get(cust_id)\n",
+ "    result = []\n",
+ "    # customers with no purchase history are not in the mapping; leave their predictions empty\n",
+ "    if idx is not None:\n",
+ "        recommendations = model.recommend(idx, sparse_user_item_csr[idx], 10)\n",
+ "        for article_idx in recommendations[0]:\n",
+ "            result.append(reverse_article_id_dict.get(article_idx))\n",
+ "    predictions.append(result)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_data['prediction'] = predictions\n",
+ "test_data"
+ ]
+ },
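+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: the competition's `sample_submission.csv` stores predictions as a single space-separated string of article ids (including the leading zero that is lost when the ids are read as integers). If that format is needed, the list column can be converted before writing the file; this is an optional sketch, not part of the flow above:\n",
+ "\n",
+ "```python\n",
+ "# Illustrative post-processing: join the recommended ids into one\n",
+ "# space-separated, zero-padded string per customer\n",
+ "test_data['prediction'] = test_data['prediction'].apply(\n",
+ "    lambda ids: ' '.join(str(i).zfill(10) for i in ids)\n",
+ ")\n",
+ "```"
+ ]
+ },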
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_data.to_csv('submission.csv', index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "kubeflow_notebook": {
+ "autosnapshot": true,
+ "docker_image": "gcr.io/arrikto/jupyter-kale-py36@sha256:dd3f92ca66b46d247e4b9b6a9d84ffbb368646263c2e3909473c3b851f3fe198",
+ "experiment": {
+ "id": "",
+ "name": ""
+ },
+ "experiment_name": "",
+ "katib_metadata": {
+ "algorithm": {
+ "algorithmName": "grid"
+ },
+ "maxFailedTrialCount": 3,
+ "maxTrialCount": 12,
+ "objective": {
+ "objectiveMetricName": "",
+ "type": "minimize"
+ },
+ "parallelTrialCount": 3,
+ "parameters": []
+ },
+ "katib_run": false,
+ "pipeline_description": "",
+ "pipeline_name": "",
+ "snapshot_volumes": true,
+ "steps_defaults": [
+ "label:access-ml-pipeline:true",
+ "label:access-rok:true"
+ ],
+ "volume_access_mode": "rwm",
+ "volumes": [
+ {
+ "annotations": [],
+ "mount_point": "/home/jovyan",
+ "name": "hm-fash-workspace-fhh9d",
+ "size": 50,
+ "size_type": "Gi",
+ "snapshot": false,
+ "type": "clone"
+ }
+ ]
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/h-and-m-fash-rec-kaggle-competition/images/kaggle_api_token.PNG b/h-and-m-fash-rec-kaggle-competition/images/kaggle_api_token.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..b842e8aee211eb2d409c06afefa8ef7c38fa1473
GIT binary patch
literal 13345
zcmdtJXIN897%s}*DCkx|Wh(+g+~Nk6CSAISf=E*lkgn282+~`K5(NuIq!}?FDxgv#
zEukbtL}>v61VRZ3NQaPwmIM-#Ty&rF-1FQ&_s@OKz0duTWMS~cmWPGft#Su
zJt=QI@l^TmB`KHnbnuVM&vag2Ax^FMTvdEx`y~2gwN&s)Pg(n#3&^k&J>y&d7~fAm
znk;wcX~BzD4U#T>|18&V(m?{eTwLBnay&l64jz(>iHREi6k!MdZ0{Jgao~^LDG3QZ
zZjNBPalMlDe%Jop2SxFrq7w2(v8z+9|Fdvor%eRJ75wh;_WB;s@4~gqtG^a5^!D=q
z4;pR-tx~E+d=ok#)8k`#O>uZ|2p9Ai$z*;n|GiivhwcNE`n@n2nU`z$5uVhEU_GBs
zv{YPLVb)l40e{aa38_+_#ffC2%_8An16Ebesm6jR{Hqp%gli@;n137v^)JLI)gyXR
zvpb6ow~|&1kDz~V0N2c_HNjzC@cAECf#Er4XKz$m*9ze7YEfpYbCoYO!~F!uCU2SIpsX4@
z;ykzPCF9(&|Cg=`J>b24H06;i``+J&|4?viIov<~sv0HgB}$CaF*h`(72a-Q*L8m*
zarrrOwfjP=?5{Cj0DsG)QO=0&fE_6u^JhwcEE>`MlB@LNo|mvv?C<5Wpvh1+vcgk3
zH?zeZv?mB`gjOgL0}nO)MVU3EsQss0cTCUd#*XB_`@?Wnbe+VXTvGgRaG~U55eA3+
z+_tR-vzII=iTJPkRNRekndQG_^g!spnLIdh_&=eYIsE^%hNJRmYW?e>Wq$T?Frv#%
zL&pV;4B%spkSZj8_H~aSSH7wsa0xwE@1)f3w9(XCSBYDih5kDj6?hYmlxWP%c~^#-
z-1w+AeC_XRqIyoaQ=m{*YD9{U>-V|FckD~Irr`o
zjsB7@j2R?xpGA`R7nB?YJWRPFF89jVKK7OlOm;bsUfg*8w$k|9&FUDoZ$^QEZhJ=o
z-T4C~v86X#!v1b^{1=tn$6~d!c0ah5W-HNttoM9^$M8>GN}f0oK?cVZdwcKKRbw3u
zv$=85El=_a3psmX47Mwz)T7}cjrm4_1xI5h9$rw_TS=YaW2Dj6`3FOXpYsMW1;uXl
zjb;1zdM_3ZZO*}(X13pD=#AwAHg?yd8=7JZGeh^hl
zJ3*)Ce*(eDVL+Es8;DV2_}czU_|Q&31P%zr{9_v1acKy8h15a@p(0k9e
zm0}Lr9H^BQZQR|{=G{GQ@t`xN((`D|Q@jt)^ywF0Vsn42af~4!&+YxP=tIH1#78PU%8Jz^(@>%AYEop7MZa}L8hzG**}B?0L;Ls>A$~8zHH=V}Uq`);mi$u+}*2v%UUd_g3xuVcb55n{TeN
zFSG(|sF3Zd1UiWO0Hb)^OI*O*PjFSF&<~UWW&Yf0t>%0k3eJvmQ@q~uhDF`lOjAjf
z43JH`Dri;HUuRve^NixPL-97adk35oKQCmA9;y`3_!0j)(G`p1@7grADbL!LS~=)Z
zeYlx|2~?u*B}HXB52sR}??b+mBn8bmbk%3S@u$jkW^Yb(MzCvsH&yTFdX*#GAfOtX
z8p*M62||^HxKNd~kf2Q%D`=CH7F`DhUlz$^bxm;{j(qyF*|@JOydPpGs&ui2*hZ-E
zthclDG(F>ifYDA(5Z~*b%g)kW@76eb)IVV$P_ZZcJ+t4(7)k?P;XWEu1d?@S6^3eT
zLZzh_HN8^^h#I7a$K)rHEE0{(=&&t14|Mg~Xuk+7lUABgB591NI}&`y
zn`CknJVZ+b_;fy%H8-UDBn?wo!R)4k3QK1aUsq_1Fj9wAvAjqd%qz62$#E$lylN3{
z0~7((>!>S;N_0O;u(l)A_=Y&^&v<*1wMj`K7%cd5lQ;|aq?Q*fy`XOQ%49zG#)zq7
zHvg_~hR4|;KVDmPcSW1)wevt-dqb0g`1{WSxgP^ZR!cbVY^d4>BJ<3Lbu}s8!8;+@U<5F6xRZmG4Euk9%q6cv5Pj@
z)%LcXizxc5xuLn?G*r}g>E{V>9kBlo$a3ulW>(I;4CSIsf>(hSI^ok)aky?CoZUmweuUTwDj>54~qXWTe)61i!Ku;@$RJ!
zMntr(iBI;E&RWt+jzPgt;6aO3UEiVvQVzGK_Qr^^<~QlIdM;`{rL8vj1OV0B`!0(Tm`POFkz+$ZBW$HJ>
zlgs%(V{EX)zD2Tiz}dY|6lnqCAxiNHvz=C0^!&+p>^T!`(m|4aX6IptQY+PL%wA29
z@lHHVN4SO-F%PayYl-j#47V6Ec>_!~
z%B;)QI<`f-5H=+3RhI+@8bzQ#f;!)geG-4wq8;9aW=tZNe_3VciR_ouT=HaE#eI{1
z{Ck)yeeDdy`SsvCD5y99)1qBE>s8dYYc5TgMAc
zhnTXQW^0B0+yT{$1e9((=|f(1&CT4oVDIqRxjJyczz4{8wlLDTmw#Vz6x0EP(fT%V
z?W1?azj-q9JyCXA3!g||5*8dBOKps!Vbvx(=nP?m9V{jdZE}c;ZKP(8SX5Sc*(3pa
z=nG5U-hId{g$cXKA|zFVx6r0ID$)71AW6^^!nwvaBZ#-|)Z{Pp;sFITxxo3YnM>z&
zec9v>D@g??K`%fV1OA)>wIcaPdh~GT8w-cT+jZEn8a2DddW+gc1w0RSgpKg$CiJCY
z_93R!;W)z?H(655UkN<{Bi@wFD9N|BfU7I-drShqs!#TGDfn^BU${dF}BoeU^nW0Il^eo
zB9eoSuLF`>*RJAbEc+T{`VeFHp)T1IiOhouL9HaG9d6$pS43C4FxK&up*_z%|Db0?
z(_8T5!F{aT#cH{x1rHhCG1u9iIQc_8MtD0|`bXTs{y>M}UhywMx+N*g9hgDjpDK0W#JDB>Dpc1%q
zr=o%%{IgH<-;me3ZhW<2t`M0X&)Trrm|
zVc^_BC=Kir(JfOSQdSt`y-qkOg$b8N6f9n>xVikTn0ZWYwD)1>7322{xh*Zo6M@QZ
zEn(MtsEqpqO>?eH$Ma+$O71KT{c{*Lgn}0bo?ag_h)SE~BjHnnGu>Y7QMd{}F8K7~GE@|_m
zE2^N>`w32kv=VvB*^&@FA1X(>zH8bkJ!ttimshEZz{LIbx9T-L1K`@r_CL7c@5D}d
z-{i;7G4;4fwZE;yHrQ0mw>9>Ne(`)QDcrx+&{a?0*P062jZl8%MV2H8!hId_PIJJN
z$BGvJ9xOYY26%5p1<;-2jfOJ`ber|j=Dd5}sdfxvAjODglpIgg)%_!ACE)$!(!+4j
z7f?4_t|qj
z4>g%a(;I?I7#2Qk5}JolwVuEIX#s0J@#zHCls??
zBJkh&jdq(4c2Pyr#>cuAN&3v3VBDi9_Fu@Q@-`2i=1h#W8?m_2!jQvUkt@>CF9fR#
zE;KXa^$0OvuwBMUPiw(#V@V3&f5-ih#-wq&fj&KeF*=!`?qpB*FjF0R@6i#RX*n)!
z>N>{ul`9+9wHhl`=!nHvl_mUWF1O^^+2G#rZ+>Zhd^x(1(k@#W65~1kcFoRj-1r#s
z9Xc*LN-nRO&|4Louq=zy^nztC5q6Q9n^xzR><2hx`rL=Sf6oLTmFRi_J_B#t*(pnRnMCvDrQj6tfa)ZW*8vGyr-XoVl~I4mQ}oQZEc?CV
z!y9Hd4dgPDDB0USF~K6NE4J%nA*T0NS=y>@t`G;88E*@6EcfaD1f917EtI3nt9NX0
z@}CkI&u+&cC@@WRTqE}NF$Q29d(-ZXg_0n{cQM7sq+c=jAQ|Y2<>+9ALCTVp7gHzs!7@TtT1H#S8^iDb|
zj_c)?_r0ZEOlb{lKHg+ORoL~s8a*ltFXW5Tx;A%@f9)a-?TKiU{sx5}e|vfELnQmh#|CYBhgjZaz}?hAOtvGOzr
z2#_{R(+XlIql}x6Q4?jK&p+_M=HI^^-}6;BtZUNdl$=iEnuRR*M!SuW?ZHK_j9Kx0
ztw+5X#4ookiEcgd%_R%*NRrM!~1JaT3qyhAOqquHHV
zC`+H|y0Nq5s!pE<{@D;AU0L)e?qVBF%9E7lPB3*oVl++7B`VbfPNAF`KxrB0dqBe`
z|5|W*(*XXpriIA3qUe}YcC>_}?|i!al1aaM+c+{`xh8giRvt(#Mf^a*ukQw;KU7^<
zVhMhPB&D+y76+^aKMm~2s+C_NS>C$$)~;A|I@%=uONi(_@<6<((0FM&;?Q?$B~HR`-aMB5
zrM4aaS~(#Bcz*I&_gs~KpIK5NN5_E@8~5jK+&2NSgDz$Zr#;zv@DiCc=KlfhhF0>`
zI5@;PkCWFF_K&qra82l_*R)MTjs!c@RQL~0h#6GMDMU#I$0!F2T?MgM84aZ>>hZ)2
z2yDldGQ4d6OIpCQzZ-v?zcQYo05ZNwt(Vt-dPLP(l7)Yi-ZeBv2~(Q#I9i}9N)3fg
zGl{L@=UWRbXO-ptkUaQ}M3^W)R=jvLBO^L;4PKrQe8+m`O3mbVTvmWED#P_QVuych
zCSyaR(L2xm^mvgo_QE;=w7NWt{ZCZ90xwR
zo7phofu2^|%!pV!wMm3M^sxJ!6q(<1N^`+T-F~;xV!q
zErpnj1)pvYc5XlJ2YErLi|SvP*lut>VNM_JLbt$Qpuo)&hC+$e&+i6OCq|WEGujbp
z$0zebNZNAH@d~y$Td%Q0Jun$zqN0e9Z)W!MqeoirTQ+y_k%jlXX0%ow3-K&nIuukh
z0oyY{u7(zz6l3YbhDmii9VZn2<2NYE_~fuwaeld{()nd-gyxa$tbM>CdD%#4=_lCr
zV5YZizCi^1BeY*e{!k0T=e>SXa^rtamE9tV6#qiN?|@gVKfYNDtxs%n*=
zoDAGi^pqq~V5ESPRlEgM=^m2Bgm2yPZfTr*R#6su+;dB!xm#Mj
z%1S)^WzHw{7~^TCYyMnq>4`qOo;+nX!#5u6!C2gfN0NLCM!uQ6pJwLO6!sF|FFj1Z
zEBs0tzd7Q|tA2~TAje%RoJG2EZ=%4p8yj37OT#8L6d$b{xYOyWJoA`n1)2qgckHSy
z^s3w(!r2poJFzgc{yRErD7sRX_k!GjIw3+9!0J>v%|5+;*HJ*7p^4v!ZMb_)
z6yz%!kYk3cn#?{>80^tNs+Mjmvw^*?O1LJtn&zRSmA*t>6Q=i1U*3FHW;Yp(v_;wN
zdO{*1l+b5&qY?mC)%09oJ{;x7;rhs_^1sBX4SwB*!y%qJ-(0A5Yl^~Qf+9w3Kq
zshaBubKw$9(qYdf$
znv`~SE$cXL$Sur$;@V{2L`yB_0SX}3a_sM+l}uCJ4t-mq{uDi1`;i6m-&(Bn`5B+Y
z9Z@Vn~{tmB)TqM-j~FjN@^qnEl_7GQ(Z>Ee$*P=Sm8z#0;ej)~UO$Fl#+_0%=c7
zU~OgTRh8Cz*MSzDFHCosV*DV7X!iKe(@@k6FIe8)+K9hE{}M@Vb{%|`Z^U;t^0cg~
zdcj)hAk03l-RjqdrLfZX^;#!yXTd`p=P4RuWM2`o`Ml8E
z7@Ja^3k~q$gNYGJB51}bfE*H{c2UnDrUf?l4D><~ZNtSS{vdW8md@Cb7VCJuKO$Zz
z>?>D1G6pfG@mSbCP1-0Dw!yEoKzy(RiA+Cl8w@sx#y@lpkcsT9()iaCa
z2-3)p$>?FD61nv8d>LIE?0FyvM`R?Hne4{+X8Dvct8`4>4Fn(P?oka`dQv+V(;XzS
z;z@f3pR*&n-7_W-6?{{79jumz1c^UiOq#p0(D5jzmc|;5Kk?%1f;}b@`Mw}$Dci45
zy#VM$RbMw0wKy>I7=yWvTLC;$>^=;FrfHl7V#2v@#0y0)wC0h6Nv-QE7s|W`(bDh9
zP{MIJS!i^B`V?gg)
z+@dbI`#!t&ea@8J_@LWUc;AcC+(3gnG2^zOnZec=_UtJ@k*Ro1(jgetOYEYxE%fAlb7)QhP36lYPN6-6bZ)sWGCx4$M~3bDeSxhRS6d`o~(
zp&0k{e@VbEdjC(P_Wnl}4#k5Y96!|DWEbYpmBu*t9q7%O9fte_-|?)bGfaP4H}QYVuh@@|zVS%*lhs+W!1{S$a1kN$_;%6l<%DeQw?H
z)85e+5-BEGla4!O`wP~Gng2GjM9x*onUisz>J8L{wZw~5a$^mUf2Zj41r?FmHII5|(1jSUX
zt9hn&9XILq!b`9ar5_#F+j3$L1JSa5Edv(5gKI85q34b3Qua@V@jPXR5H=
zYPIUs6fc#Uv|5ErUki-8A7$IaNf84!*oHtaWGh!fYSy{w
z1RVcEf4fAj?1+j|*_-`_;T=62k4^i!n``|zRMcFIxz;-MAHCR)`A>v{ucPF={$hwU
z#N4;=G-7=^L8$Q*xKiTTl5uNWdK7LFc_(}aIdR)18wbF5TSMMtkuAU8yvY6c+-TJ`*Zg>%*IGyTUx%`MyfVe?HVWVAQdgsXWw38=Xi@UaFZhHmpTP-j1-nh{GHnV6f+Bi#Kaz&>(j3PFYUhjT@%7t{`V{h;ym
z0$%&i-AQr1I+8xpM7O^_+BgM`3Duw3OjrRq&1MLOY8CPYwTIC(BY{@oI6Uy?!eNKW
z7qU_q#uIF0&k&*6t4prxw~M=2YAlKN-OX-#!CyLI2jG7nprm-tfqzOwgzGmL8!EX|
zu-XdP^LGa^@8lZyx|;&9=XSP&_5XGYy~S;9E;+Q~D!7)!<4WtxZi96h*AipPw`tUM
z{8Qy|{xHXFzQ~l4C%oo1z+4t&*0P*b
zDY7R2hm*=nWH0lAo
zRo;x=dV%R;JQx4liGE^TU)7cG!F?Z)Us_JO;u+Uy-6XMzo4xl!Tjy25NXq;XNODj1
z5p=q$BW{;gqMD*>~!nf6lf+n_q7Y&
z+`fRf4>yqRyL@S6ro=R!u(RBA?I{HC
z+X=E#lc6zY$e^3$xjQo+^|pnTj$QxaQ1b7il%D{*-
z2H5&~#vC1G9j)-QS^yX)j?a&?e))#1%~*${N6J%OG}G|XmDxSL&M-k=Fb;zIMXeH9jjsAC5TO9snlsoH`S|QuH{Uv*F+XKN?;2ihk_ZV
zVn*&S$5yqfJEW-m%S$Wk;iY4?ze#_82il8(#Ky|Y%Xc$h!eis(F9lqzAUlCgx2km7
zC_9rOGHN$p{*c?uXsIXdaH*fP|%=w-14sIy761r@4CT=^hIlJF&a;SD4FeHK?7
z8)rK%PSF~tD*w5TXlZYIlX6k)5s5_7J-D1Z8jlGTlp{V(
zMS&ZfqjXLNeHl8*Xkjl7C>Yv{eQrXZRH1TP6JPJO2Yc9y1bhs`$w*K*UYJuS-dx`2
zIWm%5zQuW6=)SKoSZ;A+D|hKZOBt?Tnc`}ZFX9j06X_{~jH0;C*GsOk@
zUFlQCJcKNIhFXj#5Iyn{Mf;asOyAF`eDgd&c^dW^$-u$!wjad`=q)N>&n!c>t!dJi
z)4BM3n+DJP$Li;9K6MYQvSvnKnIGAFJT=Mo1^>*u)xz{=eOU(6H{x94U&ux4fgq>N
z2?uJzd~dZ2j$w8QTHpM@7OAt##)8RSVlO-F?zNOY)vG(4ByzcGt?N}3wT
zjfmAHOkhBK-xZ^r&sFcYCw6Ma=rrvV;xO&=kOOtO>(DQz-t`a0QfO${aS6E0xb~YR
zh|Z|0x;m-9qSy|KN(m7ACbOyCxrEhur3|+y8xGXYaVmE-gm~nogo0IHISI;(hT|5|
zo0FWzhPQ7F(q;X~Q=jLCw}USd;})?{Qh!v_l+%4)g=Z^~ons@)+i|113gGqh@uYE)U6m{9v!5Jc
z7Y=D1MyK~~XZmaoz7)$cyIMaGM@u;<27jrw;BVWBRzRs0W(`O>`wr`lnY&!jJMD1b
z^=%9!Gb5vGNbsuvQ!6*sEqwmFGz^0|T^^&X88zLIh@4P)8rr?%l{#AsqSj#I9W4Ym
z59`G%LD@~W_?o9{mB*C~4^WtM_}Q&R%kZeXR(YZLZrFfB_)G#KQ!ft%@CI?s%`cIkNp7xB)-;3v;}^
znxGaNe1;IVn9|j7vSDwl_P+eqT&I#&{e5rEt{&u9Zb+2wiv7YlxJBEYRrkO}-{f6-
zT{x3us2Th%Ahvv$ekwI*7i-uk^@99`#^?6rx3V$Rn#i-U_*#r+LHzI-r})&2c~RF(
z#Zj7Lqk(~1E&z?n5&PzB^HqoDg(xuD{1GR)-n)r(hwlNYH$adp^^G;o6TCq56y`J)
z!o8YN)cB7h_Bj#DQ!xNI5Xaui13h}XgU;h@F;RT+y6
z2;Dsxd3SJ+z2H-jyO^B_!=j&+Ok0zH1GKrTvG+V5GVvX@Z!0@r1}Xr{+z95H%kDps
zxP$h`Pvko+`1zWF9ok
z8eO##L-fWi9{LSEu8c>tx(TCF&Wq)$iTTQCGI_CK1c8Wi5_57bwWaK#W^7f_aA2-T
z=gHZi7el^_!>;TN&XebVH6_A5O%cvZ&^z#BmzcFp
zY6cj)G6_s~1y0QF)}G=Ew&$r5U+!dpV-F@G-#am$SpS?C|?d|d_K61Y9P(QmKX@n+_iy?nyE?kmhE~NUa-fR
zBAW@M8T<}^MhYzVRVSJ8&A&d0Xp05ObZYMeeziUjdd<90R@*v2^Xg^Z1&Jh*Hs39g}pvGc^2=UH!rM`U|f-aUXpT
zgz9G!mg+r|@yc*Js^Wzk#nkti+JU|q&qYBVc_Pbq_#78iXr5!)Kp<1A6<(T{J?xSFNW9oC2nt9uMkpOsRN?+Z@SI3SqYh~hh9Z;+BZ3!O2L?U3{
za9qH4j+$2ywW)L^X6DLGz7%%L16
zm#pZn&H=Y3>5c8N07eU*vC%03T-b~z^G6e)uojAfWvO_W%s@h~THt%SZ@yEG`y+O<
zJLuF45()30w?h3P3Yq~>!>Mm5{Hi(GT&?{4Z-7hB)$otPYeUO=B2;nnD2{mu#Xu~5
zdGZq5^vW-y{#UJgyMV~~P@ikb-F~t3KfsmnxOziy;i&~?U1XDZb0d8hhQ)M>Ws~@-
z!?fKu4LI3FZD;ILsrB%9QlkF8UmX22gM|{5i-s)6)piH-UpKv&wU09DT(~`jiSlxC
zW+Y5P|F0L)jsIW6uSaFj0ORqOMG*cNL}W#cmM|__G(-mF`}q5JKvEXc{}*h|=*Xn5
z6YNd5@ejx)V0-RVwyPW0Q$8e=gI`4tAV+>daG}3T`ll+LpKU&SFLJUf(Hr$Y4jWID
zhh-?tXJsPis}fC3|2s20zwq!EqTlnE80DITC-beN`HQ5@q9Z2<1@gGrA
S^;N&-Y<0!%a@D0f$^Qq8em`IU
literal 0
HcmV?d00001
diff --git a/h-and-m-fash-rec-kaggle-competition/images/kale_cell_metadata.PNG b/h-and-m-fash-rec-kaggle-competition/images/kale_cell_metadata.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..b43addab22622ebd566d99a2128e24a6b3be3c0c
GIT binary patch
literal 18886
zcmcG$cOcdA|36B~Job!Z?`+w771=V9nU%dIn{qhzh>Xl+CnG|#*C8{qg>#5x6OP&M
zb^3h1zwf>O-2d($j&^i3#Zmv9Pd+A84pO#KOXk1YgqlxZp27;@a5Y
z8@BI5brr1YQRpW4<))*ujxrWjT{6+7%`NadftSW(Uo0#NKg<{Q(o^ODEUX9S57d;6
z0<3oO@Uo1JGaA1XI9>+^Jq>bmgqE2vNN6|BuS2_F(l#pk!-QNA$XtS<-RI3pRu6y`1~_tA6#($Kn^n5-TP#AKe>BYQGU|8Y{=
z2w!G#7o&3XCXaH1clVuUsx7nRgZ%^6vmLeRJX#O>_fHl)H)54-IB}+l13jH)8+Kh~
ztwvLy>&C--mAKM4yDy)QiscLaJ~29S=j6qmZF#n(lFTIEU6N@F(R05efgI5sKX(sjZ`!{V%E7mmAsLMzTxi8=-ulm$kXx
z@6*Vh&Z3^(a(8^=(%#O|hlOYlyE@Mn$N`aoyd)@#!s#@(V*w$g7^3lQzufiyc;HzOI9(UIHT@xQD