Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 031f0ba

Browse files
Add 'Move data into the cloud' example
1 parent da4045b commit 031f0ba

File tree

4 files changed

+103
-0
lines changed

4 files changed

+103
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
"""
This module includes an example preprocessing step.
"""

# BUG FIX: the original `import pandas as DataFrame` bound the *module*
# to the name ``DataFrame``, so the ``data: DataFrame`` annotations below
# referred to the pandas module instead of the DataFrame class.
import pandas as pd
from pandas import DataFrame
def drop_nan_columns(data: DataFrame, max_missing_fraction: float = 0.8) -> DataFrame:
    """
    Drop all columns with more than ``max_missing_fraction`` missing values.

    :param data: Input DataFrame which should be preprocessed.
    :param max_missing_fraction: Fraction of missing values above which a
        column is dropped (default 0.8, i.e. 80 percent).
    :return: DataFrame where columns with more than the given fraction of
        missing values are deleted.
    """
    # BUG FIX: the original passed ``thresh=len(data)*80/100``, which keeps
    # only columns with at least 80% *non-missing* values — i.e. it dropped
    # columns with more than 20% missing, contradicting the docstring.
    # ``thresh`` is the minimum number of non-NA values a column needs to
    # survive: a column is kept iff its missing count <= floor(n * fraction).
    min_non_missing = len(data) - int(len(data) * max_missing_fraction)
    return data.dropna(axis=1, thresh=min_non_missing)
17+
18+
19+
def drop_duplicates(data: DataFrame) -> DataFrame:
    """
    Drop duplicated rows, keeping the first occurrence of each.

    Note: despite the original docstring's claim, duplicated *columns* are
    not removed — only rows are deduplicated.

    :param data: Input DataFrame which should be preprocessed.
    :return: DataFrame without duplicated rows.
    """
    return data.drop_duplicates()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
docutils==0.15.2
2+
lockfile==0.12.2
3+
luigi==2.8.10
4+
numpy==1.18.0
5+
pandas==0.25.3
6+
python-daemon==2.1.2
7+
python-dateutil==2.8.1
8+
pytz==2019.3
9+
six==1.13.0
10+
tornado==5.1.1
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
"""Preprocessing example to show how luigi works (only one preprocessing step will be executed!)."""
from typing import Generator

import luigi
import pandas as pd
# BUG FIX: was `import pandas as DataFrame`, which bound the pandas
# *module* to the name ``DataFrame``; the annotations below expect the
# DataFrame class.
from pandas import DataFrame
from luigi.contrib.azureblob import AzureBlobTarget, AzureBlobClient

from preprocessing import drop_nan_columns
12+
class Preprocess(luigi.Task):
    """
    Applies general preprocessing steps to all CSV files loaded.
    """

    # URL of the raw CSV file hosted in the gist.
    gist_input_url: str = luigi.Parameter()
    # Connection string for the Azure blob storage account.
    connection_string: str = luigi.Parameter()
    # Name of the blob the preprocessed CSV is written to.
    filename: str = luigi.Parameter()

    def run(self):
        # Fetch the semicolon-separated CSV directly from the gist URL.
        raw_data: DataFrame = pd.read_csv(self.gist_input_url, sep=";")
        cleaned = drop_nan_columns(raw_data)

        # Stream the preprocessed frame into the Azure blob target.
        with self.output().open("w") as target_file:
            cleaned.to_csv(target_file)

    def output(self) -> luigi.Target:
        # The task result is persisted in Azure blob storage.
        # noinspection PyTypeChecker
        blob_client = AzureBlobClient(connection_string=self.connection_string)
        return AzureBlobTarget(
            container=r'clcstoragecontainer',
            blob=self.filename,
            client=blob_client,
        )
38+
39+
40+
class PreprocessAllFiles(luigi.WrapperTask):
    """
    Applies defined preprocessing steps to all files in the selected folder.
    """

    # gist where the CSV files are stored
    gist_url = 'https://gist.githubusercontent.com/falknerdominik/425d72f02bd58cb5d42c3ddc328f505f/raw/4ad926e347d01f45496ded5292af9a5a5d67c850/'
    # connection string obtained for the storage unit via azure
    azure_connection_string = '<INSERT-AZURE-CONNECTION-STRING>'

    def requires(self) -> Generator[luigi.Task, None, None]:
        """Yield one Preprocess task per CSV file hosted in the gist."""
        for filename in ['test_file1.CSV', 'test_file2.CSV']:
            yield Preprocess(
                # BUG FIX: the original interpolated a literal placeholder
                # instead of the loop's filename, so every task pointed at
                # a non-existent URL.  The input URL must be base + file.
                gist_input_url=f'{self.gist_url}{filename}',
                filename=filename,
                connection_string=self.azure_connection_string,
            )
56+
57+
58+
if __name__ == "__main__":
    # Run the wrapper task (and its required Preprocess tasks) with the
    # in-process scheduler; no central luigid daemon is needed.
    luigi.build([PreprocessAllFiles()], local_scheduler=True)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# 03 - Run Luigi in Kubernetes
2+
3+
This module will walk you through the steps necessary to work with Luigi in Kubernetes.
4+
5+
1. [Move Data into the cloud]()
6+
2. [Use Kubernetes API]()
7+
3. [Inspect results]()

0 commit comments

Comments
 (0)