diff --git a/.gccp.ini b/.gccp.ini deleted file mode 100644 index b8d65388b..000000000 --- a/.gccp.ini +++ /dev/null @@ -1,12 +0,0 @@ -[PROJECT] -version = 0.10.0 -project = storied-landing-366912 -pipelines = bitbucket -terraform = True -artifact_registry = True -git_hooks = False -terraform_path = terraform -zone = europe-west1-b -vpc_connector = True -wif = False - diff --git a/.gitignore b/.gitignore index 836a6ba72..b15db45aa 100644 --- a/.gitignore +++ b/.gitignore @@ -48,9 +48,36 @@ Thumbs.db *.mov *.wmv -# Virtual environments -*venv -*.venv +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# DS_Store (MacOS) +.DS_Store # Poetry *.lock \ No newline at end of file diff --git a/README.md b/README.md index c2dc18501..4d0e4092b 100644 --- a/README.md +++ b/README.md @@ -1,184 +1,91 @@ -# Express pipelines - -The express pipelines consists of two main separate kubeflow pipelines: - -* [Dataset Creation Pipeline](#1-dataset-creation-pipeline): Created the dataset that will be used - for finetuning the stable diffusion model -* [Stable Diffusion Finetuning pipeline](#2-sd-finetuning-pipeline): Finetunes a pretrained stable - diffusion model on the created dataset. +# Express -The separation between the two pipelines enables to run them disjointly to allow for quick -and fast experimentation; -the dataset creation pipeline -will, in practice, only be run once in a while to collect large amount of images. Whereas -the stable diffusion finetuning pipeline might be run several -time to allow experimenation -with different hyperparameters or to resume training from a certain checkpoint. - -Check out this [page](mlpipelines) for more information on kubeflow pipelines. +Express is a framework that speeds up the creation of KubeFlow pipelines to process big datasets and train [Foundation Models](https://fsi.stanford.edu/publication/opportunities-and-risks-foundation-models) +such as: -## [1. Dataset Creation pipeline](mlpipelines/pipelines/dataset_creation_pipeline.py) - -### Description +- Stable Diffusion +- CLIP +- Large Language Models (LLMs like GPT-3) + +on them. + +## Installation + +Express can be installed using pip: + +``` +pip install express +``` + +## Usage + +Express is built upon [KubeFlow](https://www.kubeflow.org/), a cloud-agnostic framework built by Google to orchestrate machine learning workflows on Kubernetes. An important aspect of KubeFlow are pipelines, which consist of a set of components being executed, one after the other. This typically involves transforming data and optionally training a machine learning model on it. Check out [this page](https://www.kubeflow.org/docs/components/pipelines/v1/concepts/) if you want to learn more about KubeFlow pipelines and components. + +Express offers ready-made components and helper functions that serve as boilerplate which you can use to speed up the creation of KubeFlow pipelines. To implement your own component, simply overwrite one of the components available in Express. In the example below, we leverage the `PyArrowTransformComponent` and overwrite its `transform` method. -The aim of this pipeline is to prepare a dataset to finetune a pre-trained Stable Diffusion on. - -In short, -the pipeline first loads in reference dataset containing curated images of a certain style/domain -(e.g. 
clip art images) and then retrieves similar images to this dataset from -the [LAION dataset](https://laion.ai/), a large scale public -dataset containing around 5 billion images. The retrieved images then -undergo additional filtering -steps to ensure that they are of good quality (e.g. single component, clean-cut). Finally, the remaining images -are then captioned using a captioning model to generate image/caption pairs for stable diffusion -finetuning. - -knn - -### Pipeline steps - -The pipeline consists of the following components: - -**[1) Dataset loader component:](mlpipelines/components/dataset_loader_component)** This component -loads in an image dataset from a specific -[Google Cloud Storage](https://cloud.google.com/storage/docs) path and creates a -parquet files with relevant metadata (image path, format, size, ...). - -**[2) Image filter component:](mlpipelines/components/image_filter_component)** This component is -used to filter the images based on the metadata -attributes to only keep images of a certain size and format. - -**[3) Image conversion component:](mlpipelines/components/image_conversion_component)** This -component converts images from different selected formats -(currently only `png` and `svg`) to `jpg` images. This step is necessary since other images formats -are not suitable for training ML model on and often contain artifacts that can be eliminated -during conversion. - -**[4) Image embedding component:](mlpipelines/components/image_embedding_component)** This component -extracts [image embeddings](https://rom1504.medium.com/image-embeddings-ed1b194d113e) -from the converted images using -a [CLIP model](https://www.google.com/search?q=clip+embeddings&oq=clip+embeddings&aqs=chrome..69i57j0i22i30j69i60j69i64l2j69i60j69i64j69i60.6764j0j7&sourceid=chrome&ie=UTF-8). -Since image embeddings are good at capturing the features of the image in a compact and useful way, -it -will be used in the next steps to retrieve images similar to our seed images. - -[**5) Clip retrieval component:**](mlpipelines/components/clip_retrieval_component) This component -retrieves images from the LAION dataset using a clip -retrieval system. The images are retrieved using an efficient index built from the previously -extracted embeddings that enables for fast and accurate -querying against a large database. Checkout -this [link](https://github.com/rom1504/clip-retrieval) for information on clip retrieval. You can -also -test out clip retrieval in -this [demo](https://rom1504.github.io/clip-retrieval/?back=https%3A%2F%2Fknn5.laion.ai&index=laion5B&useMclip=false). -The output of this component is a list of URLs containing the link to the retrieved image. - -[**6) Clip downloader component:**](mlpipelines/components/clip_downloader_component) This component -downloads the images from the list of URLs and -creates the corresponding datasets. It uses the -following [library](https://github.com/rom1504/img2dataset) -for efficient and fast image download (multi-thread approach). The images are also filtered (based -on size -and area), resized and converted during download. - -[**7) Image classifier component:**](mlpipelines/components/image_classifier_component) This -component implements two different classifiers to filter out -the retrieved images: - -* _Single component classifier:_ uses classical computer vision algorithm implementation ( - flood-fill, edge detection) - to ensure that the component we extracted are a single component. 
- - -* _Clean-cut classifier_: uses a pre-trained ML classifier that is trained to identify clean cut - images. - -[**8) Image caption component:**](mlpipelines/components/image_caption_component) This component -uses a captioning -model ([BLIP](https://github.com/salesforce/BLIP)) -to caption the final filtered images for training. - -## [2. SD finetuning pipeline](mlpipelines/pipelines/sd_finetuning_pipeline.py) - -### Description - -This pipeline aims to finetune a pre-trained stable diffusion model on the collected dataset. - -### Pipeline steps - -This pipeline consists only of a single component: - -**[1) SD finetuning component:](mlpipelines/components/sd_finetuning_component)** takes as input the -final data manifest that is output by the dataset -creation pipeline. The data -manifest keeps reference of all the necessary metadata is required for the next step such as the -reference to the filtered images and captions. The component prepares the dataset for training -according to the -required [format](https://huggingface.co/docs/datasets/image_dataset#:~:text=in%20load_dataset.-,Image%20captioning,-Image%20captioning%20datasets) -and starts the finetuning jobs. - -## **Data Manifest: a common approach to simplify different steps throughout the pipeline** -In order to keep track of the different data sources, we opt for a manifest-centered approach where -a manifest is simply a JSON file that is passed and modified throughout the different steps of the pipeline. - -```json -{ - "dataset_id":"-", - "index":"", - "associated_data":{ - "dataset":{ - "namespace_1":"", - "...":"" - }, - "caption":{ - "namespace_1":"", - "...":"" - }, - "embedding":{ - "namespace_1":"", - "commit_hash":"", - "creation_date":"", - "run_id":"" - } -} ``` -Further deep dive on some notations: +import pyarrow as pa +from pyarrow.dataset import Scanner + +from express.components.pyarrow_components import PyArrowTransformComponent, PyArrowDataset, PyArrowDatasetDraft + +class MyFirstTransform(PyArrowTransformComponent): + @classmethod + def transform(cls, data: PyArrowDataset, extra_args: Optional[Dict] = None) -> PyArrowDatasetDraft: + + # Reading data + index: List[str] = data.load_index() + my_data: Scanner = data.load("my_data_source") + + # Transforming data + table: pa.Table = my_data.to_table() + df: pd.DataFrame = table.to_pandas() + # ... + transformed_table = pa.Table.from_pandas(df) + + # Returning output. + return data.extend() \ + .with_index(in) \ + .with_data_source("my_transformed_data_source", \ + Scanner.from_batches(table.to_batches()) +``` + +## Components zoo + +Available components include: + +- Non-distributed PyArrow components: `express.components.pyarrow_components.{PyArrowTransformComponent, PyArrowLoaderComponent}` + - Data is exposed as a `pyarrow.dataset.Scanner`. Depending on the use-case, consumers can do batch transforms, or collect data in-memory to a pyarrow `Table` or pandas `DataFrame`. + +Planned components include: -* **namespace:** the namespace is used to identify the different data sources. For example, you can give -your seed images a specific namespace (e.g. `seed`). Then, the images retrieved with clip-retrieval will -have different namespace (e.g. `knn`, `centroid`). +- Spark-based components and base image. +- Pandas-based components. +- HuggingFace Datasets components? -* **index**: the index denotes a unique index for each images with the format (e.g. `seed_00010`). -It indexes all the data sources in `associated_data`. 
-**Note**: the index keeps track of all the namespace (e.g. [`seed_00010`,`centroid_0001`, ...]) +With Kubeflow, it's possible to share and re-use components across different pipelines. To see an example, checkout this [sample notebook](https://github.com/Svendegroote91/kfp_samples/blob/master/Reusable%20Components%20101.ipynb) that showcases how you can save and load a component. -* **dataset**: a set of parquet files for each namespace that contain relevant metadata -(image size, location, ...) as well as the index. +Note that Google's [AI Hub](https://aihub.cloud.google.com) also contains components that you can easily re-use. Some interesting examples: -* **caption**: a set of parquet files for each namespace that contain captions -image captions as well as the index. +- [Gather training data by querying BigQuery](https://aihub.cloud.google.com/p/products%2F4700cd7e-2826-4ce9-a1ad-33f4a5bf7433) +- [Bigquery to TFRecords converter](https://aihub.cloud.google.com/p/products%2F28a006d0-c833-4c68-98ff-37358eeb7726) +- [Executing an Apache Beam Python job in Cloud Dataflow](https://aihub.cloud.google.com/p/products%2F44999f4a-1668-4d42-a4e3-1269a8786840) +- [Submitting a Cloud ML training job as a pipeline step](https://aihub.cloud.google.com/p/products%2Ffbe29250-9b67-4dfb-8900-d6ce41cdb85a) +- [Deploying a trained model to Cloud Machine Learning Engine](https://aihub.cloud.google.com/p/products%2F7a08de6c-3864-4ccf-8151-4119e1b4e890) +- [Batch predicting using Cloud Machine Learning Engine](https://aihub.cloud.google.com/p/products%2F3d5d2340-0eb2-4b03-aecc-ae34f6105822) -* **metadata**: Helps keep track of the step that generated that manifest, code version and pipeline run id. +## Pipeline zoo -The Express pipeline consists of multiple steps defines as **Express steps** that are repeated -throughout the pipeline. The manifest pattern offers the required flexibility to promote its reuse and avoid -duplication of data sources. For example: +To do: add ready-made pipelines. -* **Data filtering** (e.g. filtering on image size): add new indices to the `index` but retain associated data. +## Examples -* **Data creation** (e.g. clip retrieval): add new indicies to the new `index` and another source of data under associated data with a new namespace. +Example use cases of Express include: -* **Data transformation** (e.g. image formatting): retain indices but replace dataset source in `dataset`. +- collect additional image-text pairs based on a few seed images and fine-tune Stable Diffusion +- filter an image-text dataset to only include "count" examples and fine-tune CLIP to improve its counting capabilities +Check out the [examples folder](examples) for some illustrations. -### Future updates -Future iterations on the Express pipelines will include the implementations of the [common express -component](mlpipelines/common) to simplify the implementations of many of the recurring operations -across components such as data loading, data transformation and manifest update. 
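The `PyArrowTransformComponent` example added in the new README above does not run as written: `.with_index(in)` uses the reserved keyword `in` instead of the loaded index, a closing parenthesis is missing, and the `typing`/`pandas` imports are absent. Below is a minimal corrected sketch; the `schema=` argument to `Scanner.from_batches` and the reuse of `transformed_table` are assumptions about the intended behaviour, not a verified part of the Express API.

```python
from typing import Dict, List, Optional

import pandas as pd
import pyarrow as pa
from pyarrow.dataset import Scanner

from express.components.pyarrow_components import (
    PyArrowTransformComponent,
    PyArrowDataset,
    PyArrowDatasetDraft,
)


class MyFirstTransform(PyArrowTransformComponent):
    @classmethod
    def transform(cls, data: PyArrowDataset, extra_args: Optional[Dict] = None) -> PyArrowDatasetDraft:
        # Read the index and one of the data sources from the input dataset.
        index: List[str] = data.load_index()
        my_data: Scanner = data.load("my_data_source")

        # Collect to pandas, apply an (arbitrary) transformation, and convert back to Arrow.
        df: pd.DataFrame = my_data.to_table().to_pandas()
        # ... transform df here ...
        transformed_table: pa.Table = pa.Table.from_pandas(df)

        # Return a dataset draft that keeps the input index and adds the new data source.
        return (
            data.extend()
            .with_index(index)
            .with_data_source(
                "my_transformed_data_source",
                Scanner.from_batches(
                    transformed_table.to_batches(), schema=transformed_table.schema
                ),
            )
        )
```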
diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml deleted file mode 100644 index 88504bea2..000000000 --- a/bitbucket-pipelines.yml +++ /dev/null @@ -1,45 +0,0 @@ -image: python:3.8 -definitions: - steps: - - step: &git-sync - name: git-sync - script: - - git checkout cf-shared - - git pull - - git remote add sync git@github.com:creativefabrica/express-pipelines.git - - git fetch sync - - git push -f sync cf-shared:dev - - step: &bandit - name: bandit - caches: - - pip - script: - - pip install $(cat requirements.txt | grep bandit) - - bandit -c pyproject.toml -r . - - step: &pylint - name: pylint - caches: - - pip - script: - - pip install $(cat requirements.txt | grep pylint) - - python pylint.py --modules-file pylint-modules - - step: &license-check - name: license-check - script: - - pip install --upgrade pip - - pip install $(cat requirements.txt | grep liccheck) - - ./run_license_check.sh --modules-file pylint-modules - -pipelines: - branches: - cf-shared: - - parallel: - - step: *git-sync - - step: *bandit - - step: *pylint - - step: *license-check - default: - - parallel: - - step: *bandit - - step: *pylint - - step: *license-check diff --git a/mlpipelines/common/README.md b/docs/README.md similarity index 51% rename from mlpipelines/common/README.md rename to docs/README.md index bf120fdb6..4fed20e4f 100644 --- a/mlpipelines/common/README.md +++ b/docs/README.md @@ -1,93 +1,102 @@ -# Express Components - Common +# Docs -Manifest handling, dataset loading and writing are moderate-complexity recurring patterns across different express components. +This file contains general documentation on Express. -To make implementing express components as lightweight as possible, Express provides a python package and base docker image that takes care of this heavy lifting, and makes it easy to implement many data transformations out of the box. +## Express: built around manifests +Express provides helper functions and boilerplate to speed up the creation of KubeFlow components and pipelines. -## 1. Concepts +### Helpers -### 1.a) DataManifests, ExpressDatasets and ExpressDatasetDrafts -A **DataManifest** is a json file that describes the location and contents of different data sources. It can be seen as the recipe for a dataset. - -An **ExpressDataset** is a wrapper object around the manifest that implements the data access logic, and exposes methods to read data from specific data sources. +Express contains a few helper modules: -After transforming the input data (see below), an **ExpressDatasetDraft** creates a plan for an output dataset / manifest, by specifying which data sources to retain from the input, which to replace with locally created data, and which new data sources to create. At the end of a component run, local data will be uploaded and an output manifest will be created. - -![Figure 1. Relation between different dataset concepts](diagrams/data-flow.png) - -### 1.b) Transforms and Loaders -The most common type of component in Express is an **ExpressTransformComponent**, which takes an ExpressDataset and an optional dict of arguments as input and returns an ExpressDatasetDraft of transformed output data. +- `io.py`: general I/O helper functions. +- `kfp.py`: include helper functions for GPU logging when running a KFP component and parsing specific KFP input. +- `logger.py`: general logger module for event logging. +- `manifest.py`: Defines the structure of the data manifest that holds the location and contents of the different data sources. 
+- `storage_helpers.py`: helper functions to interact with Google Cloud Storage (blob listing, I/O, ...) -However, at the start of a new pipeline, you won't yet have any express datasets to transform. Instead, an express pipeline can use an **ExpressLoaderComponent** as entry-point, which only takes the optional dict of arguments to construct an ExpressDatasetDraft. For example, the arguments could specify an external data location and how to interpret it, after which a loader job can create a first ExpressDataset. +Those helper functions can be used when creating components. +### Component base classes -## 2. Usage +The `express.components` module contains component base classes which you can overwrite to create components. Each of these component classes always have 2 variants; a loader and a transform version. -To implement your own Transform component, you'll need to take a dependency on the `express_components` package and subclass one of the TransformComponent base classes. +**Available implementations** +1. Non-distributed Pandas implementation.\ + - `express_components.pandas_components.{PandasTransformComponent, PandasLoaderComponent}` + - Data is exposed as a Pandas `DataFrame`. Depending on the use-case, consumers can do batch transforms, or collect data in-memory to a Pandas `DataFrame`. -### 2.a) General flow -The below example uses the PyArrow base classes. +**Planned implementations** +1. Spark-based components and base image. -#### I. Subclass one of the TransformComponent/LoaderComponent base classes and implement the transform method. +To implement your own Transform component, you'll need to take a dependency on the `express` package and subclass one of the TransformComponent base classes. ```python -from express_components.pandas_components import PandasTransformComponent, PandasDataset, - PandasDatasetDraft - +from express.components.pandas_components import PandasTransformComponent, PandasDataset, PandasDatasetDraft class MyFirstTransform(PandasTransformComponent): @classmethod - def transform(cls, data: PandasDataset, - extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) -> PandasDatasetDraft: + def transform(cls, data: PandasDataset, extra_args: Optional[Dict] = None) -> PandasDatasetDraft: + # Reading data index: pd.Series = data.load_index() my_data: pd.DataFrame = data.load("my_data_source") # filter index index = index.filter(items:) + # Transforming data my_data = my_data.apply() - return data.extend() \ + return data.extend() \ .with_index(index) \ .with_data_source("my_transformed_data_source", my_data) - ``` -#### II. Implement Docker entrypoint +Next, you need to implement the Docker entrypoint: ``` if __name__ == '__main__': MyFirstTransform.run() ``` -### 2.b) Taking a dependency on express_components -There are two ways to add `express_components` to your dependencies. +#### Taking a dependency on express +There are two ways to add `express` to your dependencies. -1. (Recommended) Build the `common` docker image, and have your component use this as a base image. This base image will include the `express_components` python package, and itself extends a PyTorch GPU image. -2. `express_components` can be installed as a standalone Python package into your custom images. See the Dockerfile of the `common` base image for an example implementation. +1. (Recommended) Build the `common` docker image, and have your component use this as a base image. 
This base image will include the `express` python package, and itself extends a PyTorch GPU image. +2. `express` can be installed as a standalone Python package into your custom images. See the Dockerfile of the `common` base image for an example implementation. -### 2.c) Pick an ExpressTransformerComponent / ExpressLoaderComponent base implementation to subclass +### Adding additional Transform/Loader base implementations -Different implementation mainly differ in how they expose the data, and what data manipulation capabilities are exposed at runtime. +Different implementations mainly differ in how they expose the data, and what data manipulation capabilities are exposed at runtime. -**Available implementations** -1. Non-distributed pyarrow implementation.\ - - `express_components.pyarrow_components.{PyArrowTransformComponent, PyArrowLoaderComponent}` - - Data is exposed as a `pyarrow.dataset.Scanner`. Depending on the use-case, consumers can do batch transforms, or collect data in-memory to a pyarrow `Table` or pandas `DataFrame`. +If you want a different data manipulation runtime, use different data structures, or do other component-level bootstrapping across multiple jobs, you could add another base implementation for the `ExpressTransformComponent` / `ExpressLoaderComponent`. -**Planned implementations** -1. Spark-based components and base image. +A general overview of the different implementation levels can be seen in Figure 2. Additional base implementations work on the middle layer, and mainly affect data loading / writing logic. +![Figure 2. Express component class hierarchy](class-hierarchy.png) -## 3. Adding additional Transform/Loader base implementations -If you want a different data manipulation runtime, use different data structures, or do other component-level bootstrapping across multiple jobs, you could add another base implementation for the `ExpressTransformComponent` / `ExpressLoaderComponent`. +More specifically, you'll need to subclass the `ExpressDatasetHandler` mix-in and implement the abstract dataset reading / writing methods. -A general overview of the different implementation levels can be seen in Figure 2. Additional base implementations work on the middle layer, and mainly affect data loading / writing logic. +Look at `express_components/pyarrow_components.py` for an example implementation. -![Figure 2. Express component class hierarchy](diagrams/class-hierarchy.png) +## Express concepts + +Manifest handling, dataset loading and writing are moderate-complexity recurring patterns across different express components. + +To make implementing express components as lightweight as possible, Express provides a python package and base docker image that takes care of this heavy lifting, and makes it easy to implement many data transformations out of the box. -More specifically, you'll need to subclass the ExpressDatasetHandler mix-in and implement the abstract dataset reading / writing methods. +### 1.a) DataManifests, ExpressDatasets and ExpressDatasetDrafts +A **DataManifest** is a JSON file that describes the location and contents of different data sources. It can be seen as the recipe for a dataset. + +An **ExpressDataset** is a wrapper object around the manifest that implements the data access logic, and exposes methods to read data from specific data sources. 
+ +After transforming the input data (see below), an **ExpressDatasetDraft** creates a plan for an output dataset / manifest, by specifying which data sources to retain from the input, which to replace with locally created data, and which new data sources to create. At the end of a component run, local data will be uploaded and an output manifest will be created. + +![Figure 1. Relation between different dataset concepts](data-flow.png) + +### 1.b) Transforms and Loaders +The most common type of component in Express is an **ExpressTransformComponent**, which takes an `ExpressDataset` and an optional dict of arguments as input and returns an `ExpressDatasetDraft` of transformed output data. -Look at `express_components/pyarrow_components.py` for an example implementation. \ No newline at end of file +However, at the start of a new pipeline, you won't yet have any express datasets to transform. Instead, an express pipeline can use an **ExpressLoaderComponent** as entry-point, which only takes the optional dict of arguments to construct an ExpressDatasetDraft. For example, the arguments could specify an external data location and how to interpret it, after which a loader job can create a first `ExpressDataset`. \ No newline at end of file diff --git a/mlpipelines/common/diagrams/class-hierarchy.png b/docs/class-hierarchy.png similarity index 100% rename from mlpipelines/common/diagrams/class-hierarchy.png rename to docs/class-hierarchy.png diff --git a/mlpipelines/common/diagrams/data-flow.png b/docs/data-flow.png similarity index 100% rename from mlpipelines/common/diagrams/data-flow.png rename to docs/data-flow.png diff --git a/docs/dataset_creation_pipeline.png b/docs/dataset_creation_pipeline.png deleted file mode 100644 index 67b30971f..000000000 Binary files a/docs/dataset_creation_pipeline.png and /dev/null differ diff --git a/mlpipelines/README.md b/docs/kubeflow_101.md similarity index 59% rename from mlpipelines/README.md rename to docs/kubeflow_101.md index 0a03ee31e..1e3afc753 100644 --- a/mlpipelines/README.md +++ b/docs/kubeflow_101.md @@ -1,24 +1,6 @@ -# Kubeflow Pipelines +## Using KubeFlow on GCP -This section contains information for getting started with Kubeflow -Pipelines and documents best practices on how to use Kubeflow Pipelines. - -## Kubeflow Pipelines context -Please checkout this [page](https://www.kubeflow.org/docs/components/pipelines/v1/concepts/) to learn about important concept of kubeflow. - -## Layout - -``` -project root -+-- mlpipelines - +-- components - +-- pipelines - +-- config - +-- helpers - +-- upload.py -``` - -## Connecting the Kubeflow Pipelines UI +### Connecting the Kubeflow Pipelines UI There are two ways to connect to KFP UI, first make sure you autheticate to the GKE cluster hosting KFP @@ -71,16 +53,14 @@ There are multiple steps for setting up the component: - Specify the requirement packages of the component in the `requirements.txt` file and specify the build steps in the `Dockerfile`. -- Finally, once you're done, you're ready to build the images and push it to the [Artifact Registry](https://cloud.google.com/artifact-registry), -you can do so by running script (`sh build_image.sh`). Make sure to enable -[kaniko](https://cloud.google.com/build/docs/optimize-builds/kaniko-cache) for faster container build time. - +- Finally, once you're done, you're ready to build the images and push it to the [Artifact Registry](https://cloud.google.com/artifact-registry). You can do so by running the script (`sh build_image.sh`). 
Make sure to enable [kaniko](https://cloud.google.com/build/docs/optimize-builds/kaniko-cache) for faster container build time. More information about the structure of a component can be found in this [section](example_components/example_component). + ## Pipelines Finally, once you have built all the separate components, you are ready to compile and upload the -pipeline. The `upload.py` contains functions to help with the compilation and upload of new pipelines. +pipeline. The `upload.py` script contains functions to help with the compilation and upload of new pipelines. [Here](pipelines/example_pipeline.py) is a simple example demonstrate a toy pipeline, you can see how parameters are passed between different steps of the pipeline. To compile the pipeline, simply execute the file, the pipeline will be uploaded to kubeflow under a specific version @@ -104,24 +84,4 @@ that typically only require a few lines of Python code. Instead of having to containerize this small Python function, you can simply 'mount' it on top of an existing image. -To get some experience with these lightweight components, -you can checkout -some [sample notebooks](https://github.com/Svendegroote91/kfp_samples) -that highlight their main functionality. - -## Re-usable and pre-made components - -With Kubeflow Pipelines it's possible to re-use components across different -pipelines. To see an example, checkout this [sample notebook](https://github.com/Svendegroote91/kfp_samples/blob/master/Reusable%20Components%20101.ipynb) -that showcases how you can save and load a component. - -The [AI Hub](https://aihub.cloud.google.com) also contains Kubeflow Pipelines -components that you can -easily re-use. Some interesting examples: - -- [Gather training data by querying BigQuery](https://aihub.cloud.google.com/p/products%2F4700cd7e-2826-4ce9-a1ad-33f4a5bf7433) -- [Bigquery to TFRecords converter](https://aihub.cloud.google.com/p/products%2F28a006d0-c833-4c68-98ff-37358eeb7726) -- [Executing an Apache Beam Python job in Cloud Dataflow](https://aihub.cloud.google.com/p/products%2F44999f4a-1668-4d42-a4e3-1269a8786840) -- [Submitting a Cloud ML training job as a pipeline step](https://aihub.cloud.google.com/p/products%2Ffbe29250-9b67-4dfb-8900-d6ce41cdb85a) -- [Deploying a trained model to Cloud Machine Learning Engine](https://aihub.cloud.google.com/p/products%2F7a08de6c-3864-4ccf-8151-4119e1b4e890) -- [Batch predicting using Cloud Machine Learning Engine](https://aihub.cloud.google.com/p/products%2F3d5d2340-0eb2-4b03-aecc-ae34f6105822) +To get some experience with these lightweight components,you can checkout some [sample notebooks](https://github.com/Svendegroote91/kfp_samples) that highlight their main functionality. \ No newline at end of file diff --git a/mlpipelines/components/base_component/docs/manifest_example.png b/docs/manifest_example.png similarity index 100% rename from mlpipelines/components/base_component/docs/manifest_example.png rename to docs/manifest_example.png diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..1b06bb958 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,3 @@ +# Examples + +This folder contains examples that illustrate Express to speed up the creation of KubeFlow components and pipelines. 
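As a companion to the compile-and-upload instructions in `docs/kubeflow_101.md` above, the sketch below shows what a minimal pipeline file typically looks like with the KFP v1 SDK. It is not the repository's `upload.py`; the component path, the input/output names and the KFP host URL are placeholders.

```python
"""Minimal sketch of compiling and uploading a KubeFlow pipeline (KFP v1 SDK)."""
import kfp
from kfp import dsl
from kfp.components import load_component_from_file

# Load a containerized component from its component.yaml specification (path is a placeholder).
example_op = load_component_from_file("components/example_component/component.yaml")


@dsl.pipeline(name="example-pipeline", description="Toy pipeline that passes a parameter between two steps.")
def example_pipeline(message: str = "hello"):
    # The input/output names depend on the component.yaml and are assumptions here.
    first_task = example_op(input_message=message)
    example_op(input_message=first_task.outputs["output_message"])


if __name__ == "__main__":
    # Compile the pipeline definition into a package that can be uploaded to Kubeflow Pipelines.
    kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml")

    # Upload the compiled package to a KFP deployment (host URL is a placeholder).
    client = kfp.Client(host="https://<your-kfp-endpoint>")
    client.upload_pipeline("example_pipeline.yaml", pipeline_name="example-pipeline")
```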
diff --git a/examples/components/single_node/README.md b/examples/components/single_node/README.md new file mode 100644 index 000000000..7308db7d3 --- /dev/null +++ b/examples/components/single_node/README.md @@ -0,0 +1,15 @@ +# Single node data loading + +This folder contains an example script showcasing data loading with Express on a single node. Make sure to install express +with the `poetry install` command from the root of the repository. + +# Usage + +Run with the following commad: + +``` +python3 data_loading.py --extra-args '{"project_id": "storied-landing-366912"}' \ + --output-manifest \ + --metadata-args '{"run_id":"test","component_name":"test_component", \ + "artifact_bucket":"storied-landing-366912-kfp-output"} +``` \ No newline at end of file diff --git a/mlpipelines/common/examples/single_node/data_loading.py b/examples/components/single_node/data_loading.py similarity index 76% rename from mlpipelines/common/examples/single_node/data_loading.py rename to examples/components/single_node/data_loading.py index e0029c5a4..5577a038c 100644 --- a/mlpipelines/common/examples/single_node/data_loading.py +++ b/examples/components/single_node/data_loading.py @@ -1,24 +1,12 @@ -"""Example script showcasing data loading with Express. Make sure to install express with -`setup.py develop` command -run with command -`python3 data_loading.py --extra-args '{"project_id": "storied-landing-366912"}' \ - --output-manifest \ - --metadata-args '{"run_id":"test","component_name":"test_component", \ - "artifact_bucket":"storied-landing-366912-kfp-output"}' - -""" from typing import Optional, Dict, Union - import pandas as pd -from express_components.pandas_components import PandasLoaderComponent, PandasDatasetDraft -from express_components.helpers.logger import configure_logging - +from express.components.pandas_components import PandasLoaderComponent, PandasDatasetDraft +from express.logger import configure_logging # pylint: disable=too-few-public-methods class SeedDatasetLoader(PandasLoaderComponent): """Class that inherits from Pandas data loading """ - @classmethod def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) -> PandasDatasetDraft: """ @@ -30,35 +18,25 @@ def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = No PandasDatasetDraft: a dataset draft that creates a plan for an output datasets/manifest """ configure_logging() - # 1) Create an example index index_list = ['index_1', 'index_2', 'index_3', 'index_4'] - # 2) Create example datasources # 2.1.1) metadata metadata = {'index': index_list, 'uri': ['uri_1', 'uri_2', 'uri_3', 'uri4'], 'size': [300, 400, 500, 600], 'format': ['jpeg', 'jpeg', 'jpeg', 'jpeg']} - df_metadata = pd.DataFrame(metadata).set_index('index') - # 2.1.2) Caption] - captions = {'index': index_list, 'captions': ['dog', 'cat', 'bear', 'duck']} - df_captions = pd.DataFrame(captions).set_index('index') - # 2.2) Create data_source dictionary data_sources = {"metadata": df_metadata, "captions": df_captions} - # Create dataset draft from index and additional data sources dataset_draft = PandasDatasetDraft(index=df_metadata.index, data_sources=data_sources) - return dataset_draft - - + if __name__ == '__main__': - SeedDatasetLoader.run() + SeedDatasetLoader.run() \ No newline at end of file diff --git a/examples/pipelines/finetune_stable_diffusion/README.md b/examples/pipelines/finetune_stable_diffusion/README.md new file mode 100644 index 000000000..80bc0f13d --- /dev/null +++ 
b/examples/pipelines/finetune_stable_diffusion/README.md
@@ -0,0 +1,150 @@
+# Pipelines for fine-tuning Stable Diffusion
+
+This folder contains two separate KubeFlow pipelines:
+
+* [Dataset Creation Pipeline](#1-dataset-creation-pipeline): creates the dataset that will be used
+  to finetune the Stable Diffusion model
+* [Stable Diffusion Finetuning pipeline](#2-sd-finetuning-pipeline): finetunes a pretrained Stable
+  Diffusion model on the created dataset.
+
+Separating the two pipelines makes it possible to run them independently, which allows for quick
+experimentation: in practice, the dataset creation pipeline will only be run once in a while to collect a large amount of images, whereas the Stable Diffusion finetuning pipeline might be run several times to experiment
+with different hyperparameters or to resume training from a certain checkpoint.
+
+## [1. Dataset Creation pipeline](pipelines/dataset_creation_pipeline.py)
+
+### Description
+
+The aim of this pipeline is to prepare a dataset to finetune a pre-trained Stable Diffusion model on, based on a set of seed images.
+
+In short, the pipeline first loads a reference (seed) dataset containing curated images of a certain style/domain
+(e.g. clip art images) and then retrieves similar images from the [LAION dataset](https://laion.ai/), a large-scale public dataset containing around 5 billion images. The retrieved images then undergo additional filtering
+steps to ensure that they are of good quality (e.g. single component, clean-cut). Finally, the remaining images
+are captioned using a captioning model to generate image/caption pairs for Stable Diffusion finetuning.
+
+### Pipeline steps
+
+The pipeline consists of the following components:
+
+**[1) Dataset loader component:](components/dataset_loader_component)** This component
+loads an image dataset from a specific [Google Cloud Storage](https://cloud.google.com/storage/docs) path and creates a
+parquet file with relevant metadata (image path, format, size, ...).
+
+**[2) Image filter component:](components/image_filter_component)** This component is
+used to filter the images based on their metadata
+attributes, keeping only images of a certain size and format.
+
+**[3) Image conversion component:](components/image_conversion_component)** This
+component converts images from different selected formats
+(currently only `png` and `svg`) to `jpg` images. This step is necessary since other image formats
+are not suitable for training ML models on and often contain artifacts that can be eliminated
+during conversion.
+
+**[4) Image embedding component:](components/image_embedding_component)** This component
+extracts [image embeddings](https://rom1504.medium.com/image-embeddings-ed1b194d113e)
+from the converted images using
+a [CLIP model](https://www.google.com/search?q=clip+embeddings&oq=clip+embeddings&aqs=chrome..69i57j0i22i30j69i60j69i64l2j69i60j69i64j69i60.6764j0j7&sourceid=chrome&ie=UTF-8).
+Since image embeddings capture the features of an image in a compact and useful way,
+they are used in the next steps to retrieve images similar to our seed images.
+
+[**5) Clip retrieval component:**](components/clip_retrieval_component) This component
+retrieves images from the LAION dataset using a clip
+retrieval system. The images are retrieved using an efficient index built from the previously
+extracted embeddings that enables fast and accurate
+querying against a large database. Check out
+this [link](https://github.com/rom1504/clip-retrieval) for more information on clip retrieval. You can
+also try out clip retrieval in
+this [demo](https://rom1504.github.io/clip-retrieval/?back=https%3A%2F%2Fknn5.laion.ai&index=laion5B&useMclip=false).
+The output of this component is a list of URLs pointing to the retrieved images.
+
+[**6) Clip downloader component:**](components/clip_downloader_component) This component
+downloads the images from the list of URLs and
+creates the corresponding datasets. It uses the
+[img2dataset](https://github.com/rom1504/img2dataset) library
+for efficient and fast image download (multi-threaded approach). The images are also filtered (based
+on size and area), resized and converted during download.
+
+[**7) Image caption component:**](components/image_caption_component) This component
+uses a captioning
+model ([BLIP](https://github.com/salesforce/BLIP))
+to caption the final filtered images for training.
+
+## [2. SD finetuning pipeline](pipelines/sd_finetuning_pipeline.py)
+
+### Description
+
+This pipeline finetunes a pre-trained Stable Diffusion model on the collected dataset.
+
+### Pipeline steps
+
+This pipeline consists of a single component:
+
+**[1) SD finetuning component:](components/sd_finetuning_component)** This component takes as input the
+final data manifest that is output by the dataset creation pipeline. The data
+manifest keeps track of all the metadata required for the next step, such as the
+references to the filtered images and captions. The component prepares the dataset for training
+according to the
+required [format](https://huggingface.co/docs/datasets/image_dataset#:~:text=in%20load_dataset.-,Image%20captioning,-Image%20captioning%20datasets)
+and starts the finetuning job.
+
+## **Data Manifest: a common approach to simplify different steps throughout the pipeline**
+In order to keep track of the different data sources, we opt for a manifest-centered approach where
+a manifest is simply a JSON file that is passed and modified throughout the different steps of the pipeline.
+
+```json
+{
+   "dataset_id":"-",
+   "index":"",
+   "associated_data":{
+      "dataset":{
+         "namespace_1":"",
+         "...":""
+      },
+      "caption":{
+         "namespace_1":"",
+         "...":""
+      },
+      "embedding":{
+         "namespace_1":"",
+         "...":""
+      }
+   },
+   "metadata":{
+      "commit_hash":"",
+      "creation_date":"",
+      "run_id":""
+   }
+}
+```
+A deeper dive into some of the notation:
+
+* **namespace:** the namespace is used to identify the different data sources. For example, you can give
+your seed images a specific namespace (e.g. `seed`). Then, the images retrieved with clip-retrieval will
+have a different namespace (e.g. `knn`, `centroid`).
+
+* **index**: the index denotes a unique index for each image (e.g. `seed_00010`).
+It indexes all the data sources in `associated_data`.
+**Note**: the index keeps track of all the namespaces (e.g. [`seed_00010`, `centroid_0001`, ...])
+
+* **dataset**: a set of parquet files for each namespace that contain relevant metadata
+(image size, location, ...) as well as the index.
+
+* **caption**: a set of parquet files for each namespace that contain image captions as well as the index.
+
+* **metadata**: helps keep track of the step that generated the manifest, the code version and the pipeline run id.
+
+The Express pipeline consists of multiple steps, defined as **Express steps**, that are repeated
+throughout the pipeline. The manifest pattern offers the required flexibility to promote its reuse and avoid
+duplication of data sources.
For example: + +* **Data filtering** (e.g. filtering on image size): add new indices to the `index` but retain associated data. + +* **Data creation** (e.g. clip retrieval): add new indicies to the new `index` and another source of data under associated data with a new namespace. + +* **Data transformation** (e.g. image formatting): retain indices but replace dataset source in `dataset`. diff --git a/mlpipelines/components.config b/examples/pipelines/finetune_stable_diffusion/components.config similarity index 100% rename from mlpipelines/components.config rename to examples/pipelines/finetune_stable_diffusion/components.config diff --git a/mlpipelines/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/__init__.py similarity index 100% rename from mlpipelines/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/__init__.py diff --git a/mlpipelines/components/clip_downloader_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/Dockerfile similarity index 100% rename from mlpipelines/components/clip_downloader_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/Dockerfile diff --git a/mlpipelines/components/clip_downloader_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/README.MD similarity index 100% rename from mlpipelines/components/clip_downloader_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/README.MD diff --git a/mlpipelines/common/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/__init__.py similarity index 100% rename from mlpipelines/common/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/__init__.py diff --git a/mlpipelines/components/clip_downloader_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/build_image.sh similarity index 100% rename from mlpipelines/components/clip_downloader_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/build_image.sh diff --git a/mlpipelines/components/clip_downloader_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/component.yaml similarity index 100% rename from mlpipelines/components/clip_downloader_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/component.yaml diff --git a/mlpipelines/components/clip_downloader_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/requirements.txt similarity index 100% rename from mlpipelines/components/clip_downloader_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/requirements.txt diff --git a/mlpipelines/common/examples/single_node/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/__init__.py similarity index 100% rename from mlpipelines/common/examples/single_node/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/__init__.py diff --git a/mlpipelines/components/clip_downloader_component/src/main.py 
b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/main.py similarity index 100% rename from mlpipelines/components/clip_downloader_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/main.py diff --git a/mlpipelines/common/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/utils/__init__.py similarity index 100% rename from mlpipelines/common/src/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/utils/__init__.py diff --git a/mlpipelines/components/clip_downloader_component/src/utils/image_downloader.py b/examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/utils/image_downloader.py similarity index 100% rename from mlpipelines/components/clip_downloader_component/src/utils/image_downloader.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_downloader_component/src/utils/image_downloader.py diff --git a/mlpipelines/components/clip_retrieval_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/Dockerfile similarity index 100% rename from mlpipelines/components/clip_retrieval_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/Dockerfile diff --git a/mlpipelines/components/clip_retrieval_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/README.MD similarity index 100% rename from mlpipelines/components/clip_retrieval_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/README.MD diff --git a/mlpipelines/common/src/express_components/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/__init__.py similarity index 100% rename from mlpipelines/common/src/express_components/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/__init__.py diff --git a/mlpipelines/components/clip_retrieval_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/build_image.sh similarity index 100% rename from mlpipelines/components/clip_retrieval_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/build_image.sh diff --git a/mlpipelines/components/clip_retrieval_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/component.yaml similarity index 100% rename from mlpipelines/components/clip_retrieval_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/component.yaml diff --git a/mlpipelines/components/clip_retrieval_component/docs/benchmark_clip_retrieval.png b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/benchmark_clip_retrieval.png similarity index 100% rename from mlpipelines/components/clip_retrieval_component/docs/benchmark_clip_retrieval.png rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/benchmark_clip_retrieval.png diff --git a/mlpipelines/components/clip_retrieval_component/docs/retrieved_images_centroid_example.png 
b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/retrieved_images_centroid_example.png similarity index 100% rename from mlpipelines/components/clip_retrieval_component/docs/retrieved_images_centroid_example.png rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/retrieved_images_centroid_example.png diff --git a/mlpipelines/components/clip_retrieval_component/docs/retrieved_images_knn_example.jpg b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/retrieved_images_knn_example.jpg similarity index 100% rename from mlpipelines/components/clip_retrieval_component/docs/retrieved_images_knn_example.jpg rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/retrieved_images_knn_example.jpg diff --git a/mlpipelines/components/clip_retrieval_component/docs/santa_captioned.jpg b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/santa_captioned.jpg similarity index 100% rename from mlpipelines/components/clip_retrieval_component/docs/santa_captioned.jpg rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/docs/santa_captioned.jpg diff --git a/mlpipelines/components/clip_retrieval_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/requirements.txt similarity index 100% rename from mlpipelines/components/clip_retrieval_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/requirements.txt diff --git a/mlpipelines/common/src/express_components/helpers/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/__init__.py similarity index 100% rename from mlpipelines/common/src/express_components/helpers/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/__init__.py diff --git a/mlpipelines/components/clip_retrieval_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/main.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/main.py diff --git a/mlpipelines/common/src/express_components/helpers/storage/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/__init__.py similarity index 100% rename from mlpipelines/common/src/express_components/helpers/storage/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/__init__.py diff --git a/mlpipelines/components/clip_retrieval_component/src/utils/embedding_utils.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/embedding_utils.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/utils/embedding_utils.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/embedding_utils.py diff --git a/mlpipelines/components/clip_retrieval_component/src/utils/knn_service.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/knn_service.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/utils/knn_service.py rename to 
examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/knn_service.py diff --git a/mlpipelines/components/clip_retrieval_component/src/utils/timer.py b/examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/timer.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/utils/timer.py rename to examples/pipelines/finetune_stable_diffusion/components/clip_retrieval_component/src/utils/timer.py diff --git a/mlpipelines/components/dataset_loader_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/Dockerfile similarity index 100% rename from mlpipelines/components/dataset_loader_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/Dockerfile diff --git a/mlpipelines/components/dataset_loader_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/README.MD similarity index 100% rename from mlpipelines/components/dataset_loader_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/README.MD diff --git a/mlpipelines/components/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/__init__.py similarity index 100% rename from mlpipelines/components/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/__init__.py diff --git a/mlpipelines/components/dataset_loader_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/build_image.sh similarity index 100% rename from mlpipelines/components/dataset_loader_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/build_image.sh diff --git a/mlpipelines/components/dataset_loader_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/component.yaml similarity index 100% rename from mlpipelines/components/dataset_loader_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/component.yaml diff --git a/mlpipelines/components/dataset_loader_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/requirements.txt similarity index 100% rename from mlpipelines/components/dataset_loader_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/requirements.txt diff --git a/mlpipelines/components/base_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/src/__init__.py similarity index 100% rename from mlpipelines/components/base_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/src/__init__.py diff --git a/mlpipelines/components/dataset_loader_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/src/main.py similarity index 100% rename from mlpipelines/components/dataset_loader_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/dataset_loader_component/src/main.py diff --git a/mlpipelines/components/image_caption_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/Dockerfile 
similarity index 100% rename from mlpipelines/components/image_caption_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/Dockerfile diff --git a/mlpipelines/components/image_caption_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/README.MD similarity index 100% rename from mlpipelines/components/image_caption_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/README.MD diff --git a/mlpipelines/components/base_component/helpers/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/__init__.py similarity index 100% rename from mlpipelines/components/base_component/helpers/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/__init__.py diff --git a/mlpipelines/components/image_caption_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/build_image.sh similarity index 100% rename from mlpipelines/components/image_caption_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/build_image.sh diff --git a/mlpipelines/components/image_caption_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/component.yaml similarity index 100% rename from mlpipelines/components/image_caption_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/component.yaml diff --git a/mlpipelines/components/image_caption_component/docs/santa_captioned.jpg b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/docs/santa_captioned.jpg similarity index 100% rename from mlpipelines/components/image_caption_component/docs/santa_captioned.jpg rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/docs/santa_captioned.jpg diff --git a/mlpipelines/components/image_caption_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/requirements.txt similarity index 100% rename from mlpipelines/components/image_caption_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/requirements.txt diff --git a/mlpipelines/components/clip_downloader_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/__init__.py similarity index 100% rename from mlpipelines/components/clip_downloader_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/__init__.py diff --git a/mlpipelines/components/image_caption_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/main.py similarity index 100% rename from mlpipelines/components/image_caption_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/main.py diff --git a/mlpipelines/components/clip_downloader_component/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/__init__.py similarity index 100% rename from mlpipelines/components/clip_downloader_component/src/__init__.py rename to 
examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/__init__.py diff --git a/mlpipelines/components/image_caption_component/src/utils/blip_model.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/blip_model.py similarity index 100% rename from mlpipelines/components/image_caption_component/src/utils/blip_model.py rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/blip_model.py diff --git a/mlpipelines/components/image_caption_component/src/utils/image_captioning.py b/examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/image_captioning.py similarity index 100% rename from mlpipelines/components/image_caption_component/src/utils/image_captioning.py rename to examples/pipelines/finetune_stable_diffusion/components/image_caption_component/src/utils/image_captioning.py diff --git a/mlpipelines/components/image_conversion_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/Dockerfile similarity index 100% rename from mlpipelines/components/image_conversion_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/Dockerfile diff --git a/mlpipelines/components/image_conversion_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/README.MD similarity index 100% rename from mlpipelines/components/image_conversion_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/README.MD diff --git a/mlpipelines/components/clip_downloader_component/src/utils/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/__init__.py similarity index 100% rename from mlpipelines/components/clip_downloader_component/src/utils/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/__init__.py diff --git a/mlpipelines/components/image_conversion_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/build_image.sh similarity index 100% rename from mlpipelines/components/image_conversion_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/build_image.sh diff --git a/mlpipelines/components/image_conversion_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/component.yaml similarity index 100% rename from mlpipelines/components/image_conversion_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/component.yaml diff --git a/mlpipelines/components/image_conversion_component/docs/png_normal_conversion.png b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/docs/png_normal_conversion.png similarity index 100% rename from mlpipelines/components/image_conversion_component/docs/png_normal_conversion.png rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/docs/png_normal_conversion.png diff --git a/mlpipelines/components/image_conversion_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/requirements.txt similarity index 100% rename from 
mlpipelines/components/image_conversion_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/requirements.txt diff --git a/mlpipelines/components/clip_retrieval_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/__init__.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/__init__.py diff --git a/mlpipelines/components/image_conversion_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/main.py similarity index 100% rename from mlpipelines/components/image_conversion_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/main.py diff --git a/mlpipelines/components/clip_retrieval_component/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/utils/__init__.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/utils/__init__.py diff --git a/mlpipelines/components/image_conversion_component/src/utils/img_conversion.py b/examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/utils/img_conversion.py similarity index 100% rename from mlpipelines/components/image_conversion_component/src/utils/img_conversion.py rename to examples/pipelines/finetune_stable_diffusion/components/image_conversion_component/src/utils/img_conversion.py diff --git a/mlpipelines/components/image_embedding_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/Dockerfile similarity index 100% rename from mlpipelines/components/image_embedding_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/Dockerfile diff --git a/mlpipelines/components/image_embedding_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/README.MD similarity index 100% rename from mlpipelines/components/image_embedding_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/README.MD diff --git a/mlpipelines/components/clip_retrieval_component/src/utils/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/__init__.py similarity index 100% rename from mlpipelines/components/clip_retrieval_component/src/utils/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/__init__.py diff --git a/mlpipelines/components/image_embedding_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/build_image.sh similarity index 100% rename from mlpipelines/components/image_embedding_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/build_image.sh diff --git a/mlpipelines/components/image_embedding_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/component.yaml similarity index 100% rename from mlpipelines/components/image_embedding_component/component.yaml rename to 
examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/component.yaml diff --git a/mlpipelines/components/image_embedding_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/requirements.txt similarity index 100% rename from mlpipelines/components/image_embedding_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/requirements.txt diff --git a/mlpipelines/components/dataset_loader_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/__init__.py similarity index 100% rename from mlpipelines/components/dataset_loader_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/__init__.py diff --git a/mlpipelines/components/image_embedding_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/main.py similarity index 100% rename from mlpipelines/components/image_embedding_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/main.py diff --git a/mlpipelines/components/dataset_loader_component/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/utils/__init__.py similarity index 100% rename from mlpipelines/components/dataset_loader_component/src/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/utils/__init__.py diff --git a/mlpipelines/components/image_embedding_component/src/utils/image_embedding.py b/examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/utils/image_embedding.py similarity index 100% rename from mlpipelines/components/image_embedding_component/src/utils/image_embedding.py rename to examples/pipelines/finetune_stable_diffusion/components/image_embedding_component/src/utils/image_embedding.py diff --git a/mlpipelines/components/image_filter_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/Dockerfile similarity index 100% rename from mlpipelines/components/image_filter_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/Dockerfile diff --git a/mlpipelines/components/image_filter_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/README.MD similarity index 100% rename from mlpipelines/components/image_filter_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/README.MD diff --git a/mlpipelines/components/image_caption_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/__init__.py similarity index 100% rename from mlpipelines/components/image_caption_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/__init__.py diff --git a/mlpipelines/components/image_filter_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/build_image.sh similarity index 100% rename from mlpipelines/components/image_filter_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/build_image.sh diff --git 
a/mlpipelines/components/image_filter_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/component.yaml similarity index 100% rename from mlpipelines/components/image_filter_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/component.yaml diff --git a/mlpipelines/components/image_filter_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/requirements.txt similarity index 100% rename from mlpipelines/components/image_filter_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/requirements.txt diff --git a/mlpipelines/components/image_caption_component/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/src/__init__.py similarity index 100% rename from mlpipelines/components/image_caption_component/src/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/src/__init__.py diff --git a/mlpipelines/components/image_filter_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/image_filter_component/src/main.py similarity index 100% rename from mlpipelines/components/image_filter_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/image_filter_component/src/main.py diff --git a/mlpipelines/components/sd_finetuning_component/Dockerfile b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/Dockerfile similarity index 100% rename from mlpipelines/components/sd_finetuning_component/Dockerfile rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/Dockerfile diff --git a/mlpipelines/components/sd_finetuning_component/README.MD b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/README.MD similarity index 100% rename from mlpipelines/components/sd_finetuning_component/README.MD rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/README.MD diff --git a/mlpipelines/components/image_caption_component/src/utils/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/__init__.py similarity index 100% rename from mlpipelines/components/image_caption_component/src/utils/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/__init__.py diff --git a/mlpipelines/components/sd_finetuning_component/build_image.sh b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/build_image.sh similarity index 100% rename from mlpipelines/components/sd_finetuning_component/build_image.sh rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/build_image.sh diff --git a/mlpipelines/components/sd_finetuning_component/component.yaml b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/component.yaml similarity index 100% rename from mlpipelines/components/sd_finetuning_component/component.yaml rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/component.yaml diff --git a/mlpipelines/components/sd_finetuning_component/requirements.txt b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/requirements.txt similarity index 100% rename from 
mlpipelines/components/sd_finetuning_component/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/requirements.txt diff --git a/mlpipelines/components/image_conversion_component/__init__.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/__init__.py similarity index 100% rename from mlpipelines/components/image_conversion_component/__init__.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/__init__.py diff --git a/mlpipelines/components/sd_finetuning_component/src/accelerate_config.yaml b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/accelerate_config.yaml similarity index 100% rename from mlpipelines/components/sd_finetuning_component/src/accelerate_config.yaml rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/accelerate_config.yaml diff --git a/mlpipelines/components/sd_finetuning_component/src/main.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/main.py similarity index 100% rename from mlpipelines/components/sd_finetuning_component/src/main.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/main.py diff --git a/mlpipelines/components/sd_finetuning_component/src/train_text_to_image.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/train_text_to_image.py similarity index 100% rename from mlpipelines/components/sd_finetuning_component/src/train_text_to_image.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/train_text_to_image.py diff --git a/mlpipelines/components/sd_finetuning_component/src/utils/dataset_utils.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/utils/dataset_utils.py similarity index 100% rename from mlpipelines/components/sd_finetuning_component/src/utils/dataset_utils.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/utils/dataset_utils.py diff --git a/mlpipelines/components/sd_finetuning_component/src/utils/training_utils.py b/examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/utils/training_utils.py similarity index 100% rename from mlpipelines/components/sd_finetuning_component/src/utils/training_utils.py rename to examples/pipelines/finetune_stable_diffusion/components/sd_finetuning_component/src/utils/training_utils.py diff --git a/mlpipelines/pipelines/config/dataset_creation_config.py b/examples/pipelines/finetune_stable_diffusion/config/dataset_creation_config.py similarity index 100% rename from mlpipelines/pipelines/config/dataset_creation_config.py rename to examples/pipelines/finetune_stable_diffusion/config/dataset_creation_config.py diff --git a/mlpipelines/pipelines/config/general_config.py b/examples/pipelines/finetune_stable_diffusion/config/general_config.py similarity index 100% rename from mlpipelines/pipelines/config/general_config.py rename to examples/pipelines/finetune_stable_diffusion/config/general_config.py diff --git a/mlpipelines/pipelines/config/sd_finetuning_config.py b/examples/pipelines/finetune_stable_diffusion/config/sd_finetuning_config.py similarity index 100% rename from mlpipelines/pipelines/config/sd_finetuning_config.py rename to examples/pipelines/finetune_stable_diffusion/config/sd_finetuning_config.py diff --git 
a/mlpipelines/components/image_conversion_component/src/__init__.py b/examples/pipelines/finetune_stable_diffusion/helpers/__init__.py similarity index 100% rename from mlpipelines/components/image_conversion_component/src/__init__.py rename to examples/pipelines/finetune_stable_diffusion/helpers/__init__.py diff --git a/mlpipelines/pipelines/helpers/lightweight.py b/examples/pipelines/finetune_stable_diffusion/helpers/lightweight.py similarity index 100% rename from mlpipelines/pipelines/helpers/lightweight.py rename to examples/pipelines/finetune_stable_diffusion/helpers/lightweight.py diff --git a/mlpipelines/pipelines/helpers/upload.py b/examples/pipelines/finetune_stable_diffusion/helpers/upload.py similarity index 100% rename from mlpipelines/pipelines/helpers/upload.py rename to examples/pipelines/finetune_stable_diffusion/helpers/upload.py diff --git a/mlpipelines/components/image_conversion_component/src/utils/__init__.py b/examples/pipelines/finetune_stable_diffusion/pipelines/__init__.py similarity index 100% rename from mlpipelines/components/image_conversion_component/src/utils/__init__.py rename to examples/pipelines/finetune_stable_diffusion/pipelines/__init__.py diff --git a/mlpipelines/pipelines/dataset_creation_pipeline.py b/examples/pipelines/finetune_stable_diffusion/pipelines/dataset_creation_pipeline.py similarity index 98% rename from mlpipelines/pipelines/dataset_creation_pipeline.py rename to examples/pipelines/finetune_stable_diffusion/pipelines/dataset_creation_pipeline.py index b7655300f..bee274127 100644 --- a/mlpipelines/pipelines/dataset_creation_pipeline.py +++ b/examples/pipelines/finetune_stable_diffusion/pipelines/dataset_creation_pipeline.py @@ -29,6 +29,8 @@ '../components/clip_retrieval_component/component.yaml') clip_downloader_component = comp.load_component( '../components/clip_downloader_component/component.yaml') +image_classifier_component = comp.load_component( + '../components/image_classifier_component/component.yaml') image_caption_component = comp.load_component( '../components/image_caption_component/component.yaml') diff --git a/mlpipelines/pipelines/requirements.txt b/examples/pipelines/finetune_stable_diffusion/pipelines/requirements.txt similarity index 100% rename from mlpipelines/pipelines/requirements.txt rename to examples/pipelines/finetune_stable_diffusion/pipelines/requirements.txt diff --git a/mlpipelines/pipelines/sd_finetuning_pipeline.py b/examples/pipelines/finetune_stable_diffusion/pipelines/sd_finetuning_pipeline.py similarity index 100% rename from mlpipelines/pipelines/sd_finetuning_pipeline.py rename to examples/pipelines/finetune_stable_diffusion/pipelines/sd_finetuning_pipeline.py diff --git a/mlpipelines/project.config b/examples/pipelines/finetune_stable_diffusion/project.config similarity index 100% rename from mlpipelines/project.config rename to examples/pipelines/finetune_stable_diffusion/project.config diff --git a/mlpipelines/start_cluster.sh b/examples/pipelines/finetune_stable_diffusion/start_cluster.sh similarity index 100% rename from mlpipelines/start_cluster.sh rename to examples/pipelines/finetune_stable_diffusion/start_cluster.sh diff --git a/mlpipelines/stop_cluster.sh b/examples/pipelines/finetune_stable_diffusion/stop_cluster.sh similarity index 100% rename from mlpipelines/stop_cluster.sh rename to examples/pipelines/finetune_stable_diffusion/stop_cluster.sh diff --git a/mlpipelines/components/image_embedding_component/__init__.py b/express/__init__.py similarity index 100% rename from 
mlpipelines/components/image_embedding_component/__init__.py rename to express/__init__.py diff --git a/express/components/__init__.py b/express/components/__init__.py new file mode 100644 index 000000000..a9429195d --- /dev/null +++ b/express/components/__init__.py @@ -0,0 +1 @@ +from .pandas_components import PandasLoaderComponent, PandasTransformComponent \ No newline at end of file diff --git a/mlpipelines/common/src/express_components/common.py b/express/components/common.py similarity index 99% rename from mlpipelines/common/src/express_components/common.py rename to express/components/common.py index cc7dff10f..5257b8151 100644 --- a/mlpipelines/common/src/express_components/common.py +++ b/express/components/common.py @@ -11,8 +11,8 @@ from pathlib import Path from typing import Dict, Optional, TypeVar, Generic, Union -from .helpers.manifest_helpers import DataManifest, DataSource, Metadata -from .helpers.storage.storage_interface import StorageHandlerModule +from express.manifest import DataManifest, DataSource, Metadata +from express.storage_interface import StorageHandlerModule # pylint: disable=no-member STORAGE_MODULE_PATH = StorageHandlerModule().to_dict()[os.environ.get('CLOUD_ENV', 'GCP')] @@ -432,4 +432,4 @@ def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = No Returns: ExpressDatasetDraft[TIndex, TData]: draft of output dataset, to be uploaded after this loader completes. - """ + """ \ No newline at end of file diff --git a/mlpipelines/common/src/express_components/pandas_components.py b/express/components/pandas_components.py similarity index 96% rename from mlpipelines/common/src/express_components/pandas_components.py rename to express/components/pandas_components.py index 3abd01508..f80cd64de 100644 --- a/mlpipelines/common/src/express_components/pandas_components.py +++ b/express/components/pandas_components.py @@ -1,4 +1,5 @@ """Pandas single component module """ + import os import importlib import tempfile @@ -9,8 +10,8 @@ from .common import ExpressDatasetHandler, ExpressDataset, ExpressTransformComponent, \ ExpressDatasetDraft, ExpressLoaderComponent -from .helpers.storage.storage_interface import StorageHandlerModule -from .helpers.manifest_helpers import DataManifest, DataSource, DataType +from express.storage_interface import StorageHandlerModule +from express.manifest import DataManifest, DataSource, DataType # Define interface of pandas draft # pylint: disable=unsubscriptable-object @@ -126,4 +127,4 @@ class PandasLoaderComponent(PandasDatasetHandler, ExpressLoaderComponent[List[st @abstractmethod def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) \ -> PandasDatasetDraft: - """Load initial dataset""" + """Load initial dataset""" \ No newline at end of file diff --git a/mlpipelines/common/src/express_components/helpers/storage/gcp_storage.py b/express/gcp_storage.py similarity index 94% rename from mlpipelines/common/src/express_components/helpers/storage/gcp_storage.py rename to express/gcp_storage.py index e00d949c5..67841f71c 100644 --- a/mlpipelines/common/src/express_components/helpers/storage/gcp_storage.py +++ b/express/gcp_storage.py @@ -8,8 +8,8 @@ from typing import List from urllib.parse import urlparse -from .storage_interface import StorageHandlerInterface, DecodedBlobPath -from ...helpers import io_helpers +from express.storage_interface import StorageHandlerInterface, DecodedBlobPath +from express import io LOGGER = logging.getLogger(__name__) @@ -89,7 +89,7 @@ def copy_folder(source: 
str, destination: str, copy_source_content: bool = False LOGGER.info("Copying folder from %s to %s [DONE]", source, destination) - folder_name = io_helpers.get_file_name(source) + folder_name = io.get_file_name(source) return os.path.join(destination, folder_name) @@ -132,5 +132,5 @@ def copy_file(self, source_file: str, destination: str) -> str: str: the path where the file was copied to """ self.copy_files([source_file], destination) - file_name = io_helpers.get_file_name(source_file, return_extension=True) - return os.path.join(destination, file_name) + file_name = io.get_file_name(source_file, return_extension=True) + return os.path.join(destination, file_name) \ No newline at end of file diff --git a/mlpipelines/common/src/express_components/helpers/io_helpers.py b/express/io.py similarity index 100% rename from mlpipelines/common/src/express_components/helpers/io_helpers.py rename to express/io.py diff --git a/express/kfp.py b/express/kfp.py new file mode 100644 index 000000000..0f9451ba8 --- /dev/null +++ b/express/kfp.py @@ -0,0 +1,22 @@ +"""General kfp helpers""" +import ast +import logging + +import torch + +LOGGER = logging.getLogger(__name__) + + +def parse_kfp_list(kfp_parsed_string: str) -> list: + """ + This is mainly to resolve issues in passing a list to kfp components. kfp will return a json + string object instead of a list. This function parses the json string to return the + original list + Reference: https://stackoverflow.com/questions/57806505/in-kubeflow-pipelines-how-to + -send-a-list-of-elements-to-a-lightweight-python-co + Args: + kfp_parsed_string (str): the list string to parse with format: '[',l',i','s','t']' + Returns: + list: the list representation of the json string + """ + return ast.literal_eval("".join(kfp_parsed_string)) diff --git a/mlpipelines/common/src/express_components/helpers/logger.py b/express/logger.py similarity index 94% rename from mlpipelines/common/src/express_components/helpers/logger.py rename to express/logger.py index 6ef49da18..3e1dbd945 100644 --- a/mlpipelines/common/src/express_components/helpers/logger.py +++ b/express/logger.py @@ -19,4 +19,4 @@ def configure_logging() -> None: handler = logging.StreamHandler(stream=sys.stdout) handler.setFormatter( logging.Formatter(fmt='[%(asctime)s | %(name)s | %(levelname)s] %(message)s')) - logger.addHandler(handler) + logger.addHandler(handler) \ No newline at end of file diff --git a/mlpipelines/common/src/express_components/helpers/manifest_helpers.py b/express/manifest.py similarity index 98% rename from mlpipelines/common/src/express_components/helpers/manifest_helpers.py rename to express/manifest.py index c4b4d0ca0..4c0005757 100644 --- a/mlpipelines/common/src/express_components/helpers/manifest_helpers.py +++ b/express/manifest.py @@ -103,4 +103,4 @@ def from_path(cls, manifest_path): with open(manifest_path) as f: manifest_load = json.load(f) # pylint: disable=no-member - return DataManifest.from_dict(manifest_load) + return DataManifest.from_dict(manifest_load) \ No newline at end of file diff --git a/mlpipelines/components/base_component/helpers/manifest_example.json b/express/manifest_example.json similarity index 100% rename from mlpipelines/components/base_component/helpers/manifest_example.json rename to express/manifest_example.json diff --git a/mlpipelines/common/src/express_components/helpers/storage/storage_interface.py b/express/storage_interface.py similarity index 94% rename from mlpipelines/common/src/express_components/helpers/storage/storage_interface.py rename to 
express/storage_interface.py index 4893242e6..b9b656303 100644 --- a/mlpipelines/common/src/express_components/helpers/storage/storage_interface.py +++ b/express/storage_interface.py @@ -14,9 +14,9 @@ class StorageHandlerModule: """Datclass to define module path for the different cloud Storage Handlers""" # pylint: disable=invalid-name - GCP: str = "express_components.helpers.storage.gcp_storage" - AWS: str = "express_components.helpers.storage.aws_storage" - AZURE: str = "express_components.helpers.storage.azure_storage" + GCP: str = "express.gcp_storage" + AWS: str = "express.aws_storage" + AZURE: str = "express.azure_storage" @dataclass_json @@ -128,4 +128,4 @@ def copy_parquet(self, parquet_path: str, destination: str) -> str: parquet_path, destination) - return local_parquet_path + return local_parquet_path \ No newline at end of file diff --git a/mlpipelines/common/Dockerfile b/mlpipelines/common/Dockerfile deleted file mode 100644 index 1288d83cc..000000000 --- a/mlpipelines/common/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime - -# install requirements -RUN pip3 install --upgrade pip - -# Install system dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install curl -y - -# Install gcloud package -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz -RUN mkdir -p /usr/local/gcloud \ - && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ - && /usr/local/gcloud/google-cloud-sdk/install.sh -ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin - -# Install local sources -RUN mkdir temp_src -COPY . temp_src -RUN pip3 install ./temp_src -RUN rm -rf temp_src - -ENTRYPOINT ["echo", "This base component should not be called directly.", \ - "Received arguments:"] \ No newline at end of file diff --git a/mlpipelines/common/build_image.sh b/mlpipelines/common/build_image.sh deleted file mode 100644 index 06250a403..000000000 --- a/mlpipelines/common/build_image.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -e -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Read component configs -. ../project.config - -# Set some variables -ARTIFACT_PATH=${PROJECT_ARTIFACT_PATH} -IMAGE_NAME="components/${PWD##*/}" -IMAGE_TAG="latest" - -# Create full name of the image -FULL_IMAGE_NAME=${ARTIFACT_PATH}/${IMAGE_NAME}:${IMAGE_TAG} -echo $FULL_IMAGE_NAME - -gcloud builds submit --machine-type n1-highcpu-32 . 
-t "$FULL_IMAGE_NAME" \ No newline at end of file diff --git a/mlpipelines/common/setup.cfg b/mlpipelines/common/setup.cfg deleted file mode 100644 index e17e9fb5e..000000000 --- a/mlpipelines/common/setup.cfg +++ /dev/null @@ -1,21 +0,0 @@ -[metadata] -name = express_components -version = 0.0.1 -author = ML6 -description = Common logic and components for Express Pipelines -keywords = express, kfp -classifiers = - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7.9 - -[options] -install_requires = - dataclasses-json==0.5.7 - google-cloud-storage==1.44.0 - kfp==1.8.14 - pandas==1.3.5 - pyarrow==10.0.0 - torch==1.12.0 - -[bdist_wheel] -python-tag=py37 diff --git a/mlpipelines/common/setup.py b/mlpipelines/common/setup.py deleted file mode 100644 index 7aaa6ff10..000000000 --- a/mlpipelines/common/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Setup""" -import setuptools - -setuptools.setup() diff --git a/mlpipelines/common/src/express_components/helpers/kfp_helpers.py b/mlpipelines/common/src/express_components/helpers/kfp_helpers.py deleted file mode 100644 index f65290749..000000000 --- a/mlpipelines/common/src/express_components/helpers/kfp_helpers.py +++ /dev/null @@ -1,36 +0,0 @@ -"""General kfp helpers""" -import ast -import logging - -import torch - -LOGGER = logging.getLogger(__name__) - - -def parse_kfp_list(kfp_parsed_string: str) -> list: - """ - This is mainly to resolve issues in passing a list to kfp components passed as - 'JsonArray`. kfp will return a json string object instead of a list. This function parses - the json string to return the original list - Reference: https://stackoverflow.com/questions/57806505/in-kubeflow-pipelines-how-to - -send-a-list-of-elements-to-a-lightweight-python-co - Args: - kfp_parsed_string (str): the list string to parse with format: '[',l',i','s','t']' - Returns: - list: the list representation of the json string - """ - return ast.literal_eval("".join(kfp_parsed_string)) - - -def get_cuda_availability(): - """Function that checks if a cuda device is available""" - cuda_available = torch.cuda.is_available() - LOGGER.info("CUDA device availability:%s", cuda_available) - - if cuda_available: - LOGGER.info(torch.cuda.get_device_name(0)) - LOGGER.info("CUDA device: %s", torch.cuda.get_device_name(0)) - LOGGER.info("Num of GPUs: %s", torch.cuda.device_count()) - LOGGER.info("Memory Usage:") - LOGGER.info("Allocated: %s GB", round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1)) - LOGGER.info("Cached: %s GB", round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1)) diff --git a/mlpipelines/common/src/express_components/helpers/manifest_example.json b/mlpipelines/common/src/express_components/helpers/manifest_example.json deleted file mode 100644 index fabcea55f..000000000 --- a/mlpipelines/common/src/express_components/helpers/manifest_example.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_id":"123456789", - "index": { - "location": "gs://express/clip/run456/output_index", - "type": "parquet", - "extensions": ["parquet"], - "n_files": 1, - "n_items": 1091 - }, - "data_sources": { - "image_metadata": { - "location": "gs://express/express_loader/run123/image_metadata", - "type": "parquet", - "extensions": ["parquet"], - "n_files": 6, - "n_items": 1234 - }, - "images": { - "location": "gs://express/express_loader/run123/images", - "type": "blob", - "extensions": ["svg", "png", "jpg"], - "n_files": 1122, - "n_items": 1122 - }, - "clip_embeddings": { - "location": "gs://express/clip/run456/clip_embeddings", - "type": "blob", - 
"extensions": ["npy"], - "n_files": 1091, - "n_items": 1091 - } - }, - "metadata":{ - "branch":"", - "commit_hash":"", - "creation_date":"", - "run_id":"run789", - "n_items": 998 - } -} diff --git a/mlpipelines/components/base_component/Dockerfile b/mlpipelines/components/base_component/Dockerfile deleted file mode 100644 index 81810ad97..000000000 --- a/mlpipelines/components/base_component/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime - -# Install system dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install curl -y - -# Downloading gcloud package -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz - -# Installing the package -RUN mkdir -p /usr/local/gcloud \ - && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ - && /usr/local/gcloud/google-cloud-sdk/install.sh - -# Adding the package path to local -ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin - -# install requirements -RUN pip3 install --upgrade pip -COPY requirements.txt / -RUN pip3 install -r /requirements.txt - -# Set the working directory to the source folder -WORKDIR /src - -# Copy over src-files of the component -COPY helpers /src/helpers diff --git a/mlpipelines/components/base_component/README.MD b/mlpipelines/components/base_component/README.MD deleted file mode 100644 index 7579e8946..000000000 --- a/mlpipelines/components/base_component/README.MD +++ /dev/null @@ -1,27 +0,0 @@ -# base_component - -### Description - -This component contains common **system dependencies** and **helper functions** that will be re-used across the different component. It -will act as base component for the other components to build open (checkout this [link](https://docs.docker.com/build/building/multi-stage/#:~:text=Use%20multi%2Dstage%20builds%F0%9F%94%97) -for more info on multi-stage builds). - -### **System Dependencies** -System dependencies include pytorch Cuda dependencies and the Google Cloud SDK which are needed to run -machine learning models and read/write to Google Cloud Storage respectively. - -### **Helper functions** -Consists of the following files: - -1. **io_helpers.py**: general I/O helper functions. -2. **kfp_helpers.py**: include helper functions for GPU logging when running a KFP component and parsing specific KFP input. -3. **logger.py**: General logger module for event logging. -4. **manifest_helpers.py**: Defines the structure of the data manifest that holds the location and contents of the different data sources. -5. **parquet_helpers.py**: General helper functions for creating and updating the different parquet files that contain the index and data sources information (metadata, captions, ...) as well -as various utilities (duplication removal, metadata and column retrieval, ...). -6. **storage_helpers.py**: helper functions to interact with Google Cloud Storage (blob listing, I/O, ...) - -### **Practical considerations** - -* Make sure to build this component first since the other components will fetch it during multi-stage buildings. -* Any changes to this component will require you to rebuild this component and the other components in order for the changes to take place. 
\ No newline at end of file diff --git a/mlpipelines/components/base_component/a.py b/mlpipelines/components/base_component/a.py deleted file mode 100644 index 65145cf3b..000000000 --- a/mlpipelines/components/base_component/a.py +++ /dev/null @@ -1,10 +0,0 @@ -from helpers.logger import get_logger -import logging -logger = get_logger(name=__name__, level=logging.INFO) - - -logger.info( - "A total of %s images out of %s (%s%%) were filtered", - 1, - 2, - 3) \ No newline at end of file diff --git a/mlpipelines/components/base_component/build_image.sh b/mlpipelines/components/base_component/build_image.sh deleted file mode 100644 index 55457c0f4..000000000 --- a/mlpipelines/components/base_component/build_image.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -e -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Read component configs -. ../../components.config - -# Set some variables -ARTIFACT_PATH=${PROJECT_ARTIFACT_PATH} -IMAGE_NAME="components/${PWD##*/}" -IMAGE_TAG="latest" - -# Create full name of the image -FULL_IMAGE_NAME=${ARTIFACT_PATH}/${IMAGE_NAME}:${IMAGE_TAG} -echo $FULL_IMAGE_NAME - -gcloud builds submit --machine-type n1-highcpu-32 . -t "$FULL_IMAGE_NAME" \ No newline at end of file diff --git a/mlpipelines/components/base_component/helpers/io_helpers.py b/mlpipelines/components/base_component/helpers/io_helpers.py deleted file mode 100644 index 32d5e3d91..000000000 --- a/mlpipelines/components/base_component/helpers/io_helpers.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -General I/O helpers function -""" -import pathlib -import logging -import os - -# pylint: disable=import-error -from helpers.logger import get_logger - -LOGGER = get_logger(name=__name__, level=logging.INFO) - - -def get_file_extension(file_url: str) -> str: - """ - Function that returns a file extension from a file url - Args: - file_url (str): the file url to return the extension from - Returns: - (str): the file extension - """ - return pathlib.Path(file_url).suffix[1:] - - -def get_file_name(file_uri: str, return_extension=False): - """ - Function that returns the file name from a given gcs uri - Args: - file_uri (str): the file uri - return_extension (bool): a boolean to indicate whether to return the file extension or not - """ - path, extension = os.path.splitext(file_uri) - file_name = os.path.basename(path) - if return_extension: - file_name = f"{file_name}{extension}" - return file_name diff --git a/mlpipelines/components/base_component/helpers/kfp_helpers.py b/mlpipelines/components/base_component/helpers/kfp_helpers.py deleted file mode 100644 index ee1ca76f5..000000000 --- a/mlpipelines/components/base_component/helpers/kfp_helpers.py +++ /dev/null @@ -1,38 +0,0 @@ -"""General kfp helpers""" -import ast -import logging - -import torch - -from helpers.logger import get_logger - -logger = get_logger(name=__name__, level=logging.INFO) - - -def parse_kfp_list(kfp_parsed_string: str) -> list: - """ - This is mainly to resolve issues in passing a list to kfp components. 
kfp will return a json - string object instead of a list. This function parses the json string to return the - original list - Reference: https://stackoverflow.com/questions/57806505/in-kubeflow-pipelines-how-to - -send-a-list-of-elements-to-a-lightweight-python-co - Args: - kfp_parsed_string (str): the list string to parse with format: '[',l',i','s','t']' - Returns: - list: the list representation of the json string - """ - return ast.literal_eval("".join(kfp_parsed_string)) - - -def get_cuda_availability(): - """Function that checks if a cuda device is available""" - cuda_available = torch.cuda.is_available() - logger.info("CUDA device availability:%s", cuda_available) - - if cuda_available: - logger.info(torch.cuda.get_device_name(0)) - logger.info("CUDA device: %s", torch.cuda.get_device_name(0)) - logger.info("Num of GPUs: %s", torch.cuda.device_count()) - logger.info("Memory Usage:") - logger.info("Allocated: %s GB", round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1)) - logger.info("Cached: %s GB", round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1)) diff --git a/mlpipelines/components/base_component/helpers/logger.py b/mlpipelines/components/base_component/helpers/logger.py deleted file mode 100644 index a07918350..000000000 --- a/mlpipelines/components/base_component/helpers/logger.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Logger module""" - -import sys -import logging - - -def get_logger(name: str, level) -> logging.Logger: - """Return logger for calling module - Args: - name (str): the logger name - level ([type]): the logger level - Returns: - logging.Logger: the logger instance - """ - logger = logging.getLogger(name) - logger.setLevel(level) - handler = logging.StreamHandler(stream=sys.stdout) - handler.setFormatter(logging.Formatter(fmt='[%(asctime)s] | %(levelname)s] %(message)s')) - logger.addHandler(handler) - return logger diff --git a/mlpipelines/components/base_component/helpers/manifest_helpers.py b/mlpipelines/components/base_component/helpers/manifest_helpers.py deleted file mode 100644 index 848b20c2c..000000000 --- a/mlpipelines/components/base_component/helpers/manifest_helpers.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Script for generating the dataset manifest that will be passed and updated through different -components of the pipeline -""" -from dataclasses import dataclass, field -from typing import Dict - -from dataclasses_json import dataclass_json - - -@dataclass -class AssociatedData: - """ - The data information associated with the manifest - Args: - dataset (str): the path to the dataset parquet file containing image ids, gcs paths and - their associated metadata (url, format, ...) 
- caption (str): the path to the caption parquet file containing image ids and their - associated captions - embedding (str): the path to the embedding gcs url containing the `.npy` image embeddings - """ - dataset: Dict[str, str] = field(default_factory=dict) - caption: Dict[str, str] = field(default_factory=dict) - embedding: Dict[str, str] = field(default_factory=dict) - - -@dataclass -class Metadata: - """ - The metadata associated with the manifest - Args: - branch (str): the git branch associated with that manifest - commit_hash (str): the commit hash associated with that manifest - creation_date (str): the creation date of the manifest - run_id (str): the kfp run id associated with the manifest (kfp.dsl.EXECUTION_ID_PLACEHOLDER) - """ - branch: str = field(default_factory=str) - commit_hash: str = field(default_factory=str) - creation_date: str = field(default_factory=str) - run_id: str = field(default_factory=str) - - -@dataclass_json -@dataclass -class DataManifest: - """ - The dataset Manifest - Args: - dataset_id (str): the id of the dataset - index (str): the path to the index parquet file containing the image ids of the - images to process - associated_data (AssociatedData): The data information associated with the manifest - metadata (Metadata): The metadata associated with the manifest - - """ - dataset_id: str = field(default_factory=str) - index: str = field(default_factory=str) - associated_data: AssociatedData = field(default_factory=AssociatedData) - metadata: Metadata = field(default_factory=Metadata) diff --git a/mlpipelines/components/base_component/helpers/parquet_helpers.py b/mlpipelines/components/base_component/helpers/parquet_helpers.py deleted file mode 100644 index 80ffa63c3..000000000 --- a/mlpipelines/components/base_component/helpers/parquet_helpers.py +++ /dev/null @@ -1,309 +0,0 @@ -""" -General helpers functions for reading and updating parquet datasets -""" - -import os -import itertools -import logging -from typing import Dict, Callable, List, Union, Iterable, Any, Optional - -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import pyarrow.dataset as ds -import pyarrow.compute as pc -from pyarrow.dataset import Expression, Scanner - -# pylint: disable=import-error -from helpers.logger import get_logger -from helpers import io_helpers - -LOGGER = get_logger(name=__name__, level=logging.INFO) - - -def get_batches_from_iterable(data_iterable: Iterable, schema: pa.schema, - chunk_size: int) -> pa.record_batch: - """ - Function that returns batches from an iterable to write to a parquet dataset - Args: - data_iterable (Callable): an iterator that yields a tuple. 
Used to return - the data to write (in chunks) - schema (List[Tuple[str, any]): the schema of the file to write - chunk_size (int): the chunk size that is used to restrict the number of rows that are - being read and written to the parquet file on each loop (avoid memory overflow when - reading large amounts of data) - - """ - rows_it = iter(data_iterable) - while True: - batch = pa.RecordBatch.from_pandas( - df=pd.DataFrame(itertools.islice(rows_it, chunk_size), columns=schema.names), - schema=schema, preserve_index=False) - if not batch: - break - yield batch - - -def write_parquet_file(parquet_path: str, data_iterable: Iterable, - schema: pa.schema, chunk_size: int = 1000) -> None: - """ - Function that writes a parquet file from an iterable functions and stores it in a local storage - Args: - parquet_path (str): the path to the parquet file - data_iterable (Callable): an iterator that yields a tuple. Used to return - the data to write (in chunks) - schema (List[Tuple[str, any]): the schema of the file to write - chunk_size (int): the chunk size that is used to restrict the number of rows that are - being read and written to the parquet file on each loop (avoid memory overflow when - reading large amounts of data) - """ - - batches = get_batches_from_iterable(data_iterable=data_iterable, schema=schema, - chunk_size=chunk_size) - - # Write the batches - with pq.ParquetWriter(parquet_path, schema=schema) as writer: - for batch in batches: - writer.write_batch(batch) - - -def append_to_parquet_file(parquet_path: str, data_iterable: Iterable, tmp_path: str, - chunk_size: int = 1000) -> None: - """ - Function that append the results of an iterable function to an existing parquet file - Args: - parquet_path (str): the path to the parquet file to append to - data_iterable (Callable): an iterator that yields a tuple. 
Used to return - the data to write (in chunks) - tmp_path (str): the temporary path where to write the temporary appended parquet file - chunk_size (int): the chunk size that is used to restrict the number of rows that are - being read and written to the parquet file on each loop (avoid memory overflow when - reading large amounts of data) - """ - - src_dataset = ds.dataset(parquet_path) - src_batches = src_dataset.to_batches() - batches_to_append = get_batches_from_iterable(data_iterable=data_iterable, - schema=src_dataset.schema, - chunk_size=chunk_size) - # Results will be written to a temporary parquet first since we cannot overwrite the original - # while reading batches from it - file_name = f"{io_helpers.get_file_name(parquet_path)}_tmp.parquet" - tmp_parquet_path = os.path.join(tmp_path, file_name) - # Write the batches - with pq.ParquetWriter(tmp_parquet_path, schema=src_dataset.schema) as writer: - for src_batch in src_batches: - writer.write_batch(src_batch) - # Write the appended batches - for batch_to_append in batches_to_append: - writer.write_batch(batch_to_append) - os.replace(tmp_parquet_path, parquet_path) - - -def get_column_list_from_parquet(parquet_scanner_or_path: Union[str, Scanner], - column_name: str) -> List[Any]: - """ - Function that returns a defined parquet column as a list - Args: - parquet_scanner_or_path: Union[str, Scanner]: a path to a parquet file or the - parquet dataset - column_name (str): the name of the column - Returns: - List[str]: a list containing the column rows - """ - column_list = [] - if isinstance(parquet_scanner_or_path, str): - parquet_batches_getter = ds.dataset(parquet_scanner_or_path).to_batches() - else: - parquet_batches_getter = parquet_scanner_or_path.to_batches() - for batch in parquet_batches_getter: - column_list.extend(batch.to_pydict()[column_name]) - - return column_list - - -def remove_common_duplicates(dataset_to_filter_path: str, reference_dataset_path: str, - duplicate_columns_name: str, tmp_path: str): - """ - Function that removes overlapping duplicates from a dataset with respect to a field (columns) - in a reference dataset - Args: - dataset_to_filter_path (str): the source to the dataset from which to filter out the - duplicates - reference_dataset_path (str): the reference dataset containing the duplicate ids - duplicate_columns_name (str): the reference columns where the duplicates exist - tmp_path (str): the temporary path where to write the temporary appended parquet file - """ - index_dataset = get_column_list_from_parquet( - parquet_scanner_or_path=dataset_to_filter_path, - column_name=duplicate_columns_name) - index_reference_dataset = get_column_list_from_parquet( - parquet_scanner_or_path=reference_dataset_path, - column_name=duplicate_columns_name) - # Pyarrow does not have a "not_in" filter we have to find the non overlapping column elements - # between the two datasets - non_overlapping_indices = list(set(index_dataset) ^ set(index_reference_dataset)) - nb_duplicates = ((len(index_dataset) + len(index_reference_dataset)) - len( - non_overlapping_indices)) / 2 - # Construct parquet filters and filter based on the criteria - filters = (pc.field(duplicate_columns_name).isin(non_overlapping_indices)) - - file_name = f"{io_helpers.get_file_name(dataset_to_filter_path)}_tmp.parquet" - tmp_parquet_path = os.path.join(tmp_path, file_name) - - filtered_dataset = filter_parquet_file(file_path=dataset_to_filter_path, filters=filters) - - # Write the new filtered parquet file - with 
pq.ParquetWriter(tmp_parquet_path, schema=filtered_dataset.dataset_schema) as writer: - for batch in filtered_dataset.to_batches(): - writer.write_batch(batch) - - os.replace(tmp_parquet_path, dataset_to_filter_path) - - LOGGER.info("Number of removed duplicates from %s was %s", dataset_to_filter_path, - nb_duplicates) - - -def write_index_parquet(index_parquet_path: str, data_iterable_producer: Callable, **kwargs): - """ - Function that writes the index id parquet information to a parquet file - Args: - - index_parquet_path (str): the path to the index id parquet file - data_iterable_producer (Callable): an iterable function that returns a tuple and is - used to return the data to write (in chunks) - """ - - # Define index id schema - schema = pa.schema([ - pa.field('index', pa.string())], - metadata={"description": "Parquet file containing the list of index ids of images" - "in the format _"}) - - write_parquet_file(parquet_path=index_parquet_path, - data_iterable=data_iterable_producer(**kwargs), - schema=schema) - - LOGGER.info("index id parquet file written to %s.", index_parquet_path) - - -def write_dataset_parquet(dataset_parquet_path: str, data_iterable_producer: Callable, **kwargs): - """ - Function that dataset parquet information to a parquet file - Args: - dataset_parquet_path (str): the path to the dataset parquet file - data_iterable_producer (Callable): an iterable function that returns a tuple and - is used to return the data to write (in chunks) - """ - - schema = pa.schema([ - pa.field('file_uri', pa.string()), - pa.field('file_id', pa.string()), - pa.field('file_size', pa.int32()), - pa.field('file_extension', pa.string()) - ], - metadata={"description": "Parquet file containing info of the image dataset (id," - "uri, size, extension)"}) - - write_parquet_file(parquet_path=dataset_parquet_path, - data_iterable=data_iterable_producer(**kwargs), - schema=schema) - - LOGGER.info("dataset parquet file written to %s.", dataset_parquet_path) - - -def write_captions_parquet(caption_parquet_path: str, data_iterable_producer: Callable, **kwargs): - """ - Function that dataset parquet information to a parquet file - Args: - caption_parquet_path (str): the path to the caption parquet file - data_iterable_producer (Callable): an iterable function that returns a tuple and is used to - return the data to write (in chunks) - """ - - schema = pa.schema([ - pa.field('file_id', pa.string()), - pa.field('file_uri', pa.string()), - pa.field('file_captions', pa.list_(pa.string())) - ], - metadata={"description": "Parquet file containing info of the image dataset (id," - "uri, size, extension)"}) - - write_parquet_file(parquet_path=caption_parquet_path, - data_iterable=data_iterable_producer(**kwargs), - schema=schema) - - LOGGER.info("dataset captions file written to %s.", caption_parquet_path) - - -def write_clip_retrieval_parquet(clip_retrieval_parquet_path: str, - data_iterable_producer: Callable, **kwargs): - """ - Function that dataset parquet information to a parquet file - Args: - clip_retrieval_parquet_path (str): the path to the clip retrieval parquet file - data_iterable_producer (Callable): an iterable function that returns a tuple and is used to - return the data to write (in chunks) - """ - - schema = pa.schema([ - pa.field('id', pa.int64()), - pa.field('url', pa.string()) - ], - metadata={"description": "Parquet file containing info of the clip retrieval job"}) - - write_parquet_file(parquet_path=clip_retrieval_parquet_path, - data_iterable=data_iterable_producer(**kwargs), - 
schema=schema) - - LOGGER.info("clip retrieval parquet path file written to %s.", clip_retrieval_parquet_path) - - -def insert_metadata(file_name: str, file_path: str, custom_metadata: Dict[any, any]): - """ - Update metadata of parquet file - Args: - file_name (str): the name of the parquet file - file_path (str): the path to the parquet file - custom_metadata (Dict[any, any]): the new metadata to update - """ - file_path = os.path.join(file_path, file_name) - parquet_table = pa.parquet.read_table(file_path) - existing_metadata = parquet_table.schema.metadata - merged_metadata = {**custom_metadata, **existing_metadata} - parquet_table = parquet_table.replace_schema_metadata(merged_metadata) - pq.write_table(parquet_table, file_path) - - -def filter_parquet_file(file_path: str, filters: Expression, - batch_size: Optional[int] = 10_000) -> Scanner: - """ - Function that loads in a parquet file and filters it - Args: - file_path (str): the path to the parquet file - filters (Expression): a collection of filters to apply. See - "https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html" - for more details - batch_size (int): the batch size of the scanner - Returns: - Scanner: scanned dataset - """ - dataset = ds.dataset(file_path).scanner(filter=filters, batch_size=batch_size) - - return dataset - - -def get_nb_rows_from_parquet(parquet_dataset_or_path: Union[str, pq.ParquetDataset]) -> int: - """ - Function that returns the number of rows of a parquet file - Args: - parquet_dataset_or_path: Union[str, pq.ParquetDataset]: a path to a parquet file or the - parquet dataset - Returns: - int: the total number of rows - """ - if isinstance(parquet_dataset_or_path, str): - dataset = pq.ParquetDataset(parquet_dataset_or_path, use_legacy_dataset=False) - else: - dataset = parquet_dataset_or_path - return sum([fragment.metadata.num_rows for fragment in dataset.fragments]) diff --git a/mlpipelines/components/base_component/helpers/storage_helpers.py b/mlpipelines/components/base_component/helpers/storage_helpers.py deleted file mode 100644 index ba073e52a..000000000 --- a/mlpipelines/components/base_component/helpers/storage_helpers.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Write and read files from Google cloud storage -""" -import hashlib -import subprocess # nosec -import logging -import os -from urllib.parse import urlparse -from typing import Optional, List, Tuple - -from google.cloud import storage - -# pylint: disable=import-error -from helpers.logger import get_logger - -LOGGER = get_logger(name=__name__, level=logging.INFO) - - -def unique_name_from_str(str_to_encode: str, last_idx: int = 12) -> str: - """ - Function that generates a unique id name from a given string - Args: - str_to_encode (str): the string to encode - last_idx (int): the number of characters of the unique id - Returns: - str: the encoded string - """ - string = str_to_encode.encode('utf-8') - unique_name = hashlib.sha384(string).hexdigest()[0:last_idx] - return unique_name - - -def decode_gcs_url(gcs_path: str) -> Tuple[str, str]: - """ - Function that decodes a GCS path to a bucket and blob path - (i.e. 
file path) - Args: - gcs_path (str): the GCS URL (starting with https://) or URI (starting with gs://) - Returns: - Tuple[str, str]: a tuple containing the bucket and blob path - """ - parsed_url = urlparse(gcs_path) - if parsed_url.scheme == 'gs': - bucket, blob_path = parsed_url.hostname, parsed_url.path[1:] - else: - path = parsed_url.path[1:].split('/', 1) - bucket, blob_path = path[0], path[1] - return bucket, blob_path - - -def get_blob_list(storage_client: storage.Client, bucket_name: str, - prefix: Optional[str] = None) -> List[storage.Blob]: - """ - Function that returns all the blobs in a bucket. - Optionally you can pass a prefix (i.e. a folder path) to only list blobs within - that prefix - Args: - storage_client (storage.Client): the gcs storage client - bucket_name (str): the name of the bucket - prefix (Optional[str]): the prefix of the bucket. If not specified, all the blobs within - the given bucket will be listed - Returns: - List[storage.Blob]: a list of blob objects - """ - - blobs_iterator = storage_client.list_blobs(bucket_name, prefix=prefix) - blob_list = [blob for blob in blobs_iterator if blob.size != 0] - return blob_list - - -def get_blob_metadata(storage_client: storage.Client, bucket_name: str, - prefix: Optional[str] = None, id_prefix: Optional[str] = None) -> tuple: - """ - Function that yields metadata for all the blobs in a bucket. - Optionally you can pass a prefix (i.e. a folder path) to only list blobs - within that prefix - Args: - storage_client (storage.Client): the gcs storage client - bucket_name (str): the name of the bucket - prefix (Optional[str]): the prefix of the bucket. If not specified, all the blobs within - the given bucket will be listed - id_prefix (Optional[str]): a prefix to add to the file id - Returns: - tuple: tuple containing relevant metadata - """ - - blob_list = get_blob_list(storage_client, bucket_name, prefix) - - for blob in blob_list: - # id taken from the blob id of format ('/') - blob_id = unique_name_from_str(str(blob.id)) - if id_prefix: - file_id = f'{id_prefix}_{blob_id}' - else: - file_id = blob_id - file_size = blob.size - file_extension = blob.content_type.rsplit('/')[1] - file_uri = f"gs://{bucket_name}/{blob.name}" - yield file_uri, file_id, file_size, file_extension - - -def get_blob_id(storage_client: storage.Client, bucket_name: str, - prefix: Optional[str] = None, id_prefix: Optional[str] = None) -> str: - """ - Function that returns blob id from a blob list - Args: - storage_client (storage.Client): the gcs storage client - bucket_name (str): the name of the bucket - prefix (Optional[str]): the prefix of the bucket. 
If not specified, all the blobs within - the given bucket will be listed - id_prefix (Optional[str]): a prefix to add to the file id - Returns: - str: yields the file ids - """ - blob_list = get_blob_list(storage_client, bucket_name, prefix) - - for blob in blob_list: - # id taken from the blob id of format ('/') - blob_id = unique_name_from_str(str(blob.id)) - file_id = f'{id_prefix}_{blob_id}' - yield file_id - - -def download_file_from_bucket(storage_client: storage.Client, gcs_uri: str, - folder_to_download: str) -> str: - """ - Function that downloads a file from GCS to a local directory - Args: - storage_client (storage.Client): the gcs storage client - gcs_uri (str): the gcs path to download (URL or URI format) - folder_to_download (str): the local folder to download the file to - Returns: - str: the full path where the file was downloaded - """ - bucket, blob_path = decode_gcs_url(gcs_uri) - bucket = storage_client.bucket(bucket) - blob = bucket.blob(blob_path) - path_to_download = os.path.join(os.path.dirname(folder_to_download), - os.path.basename(blob_path)) - blob.download_to_filename(path_to_download) - - return path_to_download - - -def copy_files_bulk(filelist_source: str, destination_path: str): - """ - Copies files from a source path to a destination path using parallel multi-threading - for increased efficiency - Args: - filelist_source (str): path to file list containing one-per-line list of GCS URIs or a list - of local paths to be copied - destination_path (str): Path to where the files should be copied (can be a local path or a - GCS URI). - """ - LOGGER.info("Copying files from %s to %s ", filelist_source, destination_path) - pipe_file_list = subprocess.Popen(["cat", filelist_source], stdout=subprocess.PIPE) # nosec - subprocess.call( # nosec - ['gsutil', '-o', '"GSUtil:use_gcloud_storage=True"', '-q', '-m', 'cp', '-I', - destination_path], stdin=pipe_file_list.stdout) - LOGGER.info("Copying files from %s to %s [DONE]", filelist_source, destination_path) - - -def copy_folder_bulk(source_path: str, destination_path: str): - """Copies a folder from a source path to a destination path using parallel multi-threading - for increased efficiency - Args: - source_path (str): Path from where the files should be copied (can be a local path or a - GCS URI). - destination_path (str): Path to where the files should be copied (can be a local path or a - GCS URI). 
- """ - LOGGER.info("Copying folder from %s to %s ", source_path, destination_path) - subprocess.run( # nosec - ["gsutil", '-o', '"GSUtil:use_gcloud_storage=True"', '-q', "-m", "cp", "-r", source_path, - destination_path], - check=True) - LOGGER.info("Copying folder from %s to %s [DONE]", source_path, destination_path) - - -def upload_file_to_bucket(storage_client: storage.Client, file_to_upload_path: str, - bucket_name: str, blob_path: str) -> None: - """ - Upload file to bucket - Args: - storage_client (storage.Client): the gcs storage client - file_to_upload_path (str): path of file to upload to the bucket - bucket_name (str): name of the bucket to upload the data to - blob_path (str): the path to the blob to upload the data within the specified bucket - """ - - bucket = storage_client.bucket(bucket_name) - - blob = bucket.blob(blob_path) - blob.upload_from_filename(file_to_upload_path) - - LOGGER.info(" %s uploaded to GCS at gs://%s/%s'.", - file_to_upload_path, bucket_name, blob_path) - - -def copy_blob(storage_client: storage.Client, - bucket_name: str, blob_path: str, destination_bucket_name: str, - destination_blob_name: str): - """ - Function that copies a blob from one bucket to another - Args: - storage_client (storage.Client): the gcs storage client - bucket_name (str): the source bucket name - blob_path (str): the source blob name - destination_bucket_name (str): the destination bucket name - destination_blob_name (str): the destination blob name - """ - source_bucket = storage_client.bucket(bucket_name) - source_blob = source_bucket.blob(blob_path) - destination_bucket = storage_client.bucket(destination_bucket_name) - source_bucket.copy_blob(source_blob, destination_bucket, destination_blob_name) diff --git a/mlpipelines/components/base_component/requirements.txt b/mlpipelines/components/base_component/requirements.txt deleted file mode 100644 index 25a799f55..000000000 --- a/mlpipelines/components/base_component/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -dataclasses-json==0.5.7 -google-cloud-storage==1.44.0 -google-api-core==2.8.1 -kfp==1.8.14 -pandas==1.3.5 -pyarrow==10.0.0 -torch==1.12.0 diff --git a/mlpipelines/components/image_embedding_component/src/__init__.py b/mlpipelines/components/image_embedding_component/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/components/image_embedding_component/src/utils/__init__.py b/mlpipelines/components/image_embedding_component/src/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/components/image_filter_component/__init__.py b/mlpipelines/components/image_filter_component/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/components/image_filter_component/src/__init__.py b/mlpipelines/components/image_filter_component/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/components/sd_finetuning_component/__init__.py b/mlpipelines/components/sd_finetuning_component/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/components/sd_finetuning_component/src/__init__.py b/mlpipelines/components/sd_finetuning_component/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/check_gpu_component/Dockerfile b/mlpipelines/example_components/check_gpu_component/Dockerfile deleted file mode 100644 index 359c56116..000000000 --- 
a/mlpipelines/example_components/check_gpu_component/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -FROM europe-west1-docker.pkg.dev/storied-landing-366912/storied-landing-366912-default-repository/mlpipelines/kubeflow/components/base_component:latest - -# install requirements -COPY requirements.txt / -RUN pip3 install -r /requirements.txt - -# Set the working directory to the source folder -WORKDIR /src - -# Copy over src-files of the component -COPY src /src - -ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/mlpipelines/example_components/check_gpu_component/README.md b/mlpipelines/example_components/check_gpu_component/README.md deleted file mode 100644 index 395b8b228..000000000 --- a/mlpipelines/example_components/check_gpu_component/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# check_gpu_component - -### Description - -This is an example of a Kubeflow Pipelines Reusable Component. -Kubeflow components are the steps in your ML Pipelines. -They are self-contained pieces of code that are analogous to functions, with a name, inputs and outputs, and a body of logic. -A conceptual overview of components in Kubeflow can be found in the [Kubeflow Documentation](https://www.kubeflow.org/docs/pipelines/overview/concepts/component/). - -A component consists of: -1. src/main.py -2. component.yaml -3. Dockerfile -4. build_image.sh - -# src/main.py -The main.py file contains the main logic of our component as well as the argument parsing. -To update this file for your use case, you need only update the logic within the function `component()`, adding any parameters if needed. -You can update the function's name but if you do so, don't forget to update the reference in the `__main__` call on line 43. -If you add parameters to this component, don't forget to update the argument references in the component.yaml file. -You can also specify the component function in a separate python file from main.py, provided you include the import statement in main.py. - -# component.yaml -This YAML file contains the specification for the component. It includes details of the component's input and output parameters, as well as a reference to the Container Image. -In this implementation, the Container Image reference in this file is updated by a sed command in the build_image.sh script, so it will be automatically updated when the Component Image is built. -Pipelines that want to use this component are pointed to the Component's YAML file to know which image to use in the pipeline, and what parameters the component expects. - -# Dockerfile -This file is standard for building Docker images. It contains all of the commands used to assemble a Docker Image and is used to automate these steps in sequence. -If your Component relies on extra, perhaps custom, modules of code that need to be included with your Component code, you can add a COPY statement below the one copying the 'src/' folder on line 8. -This will make sure that when the image is built, the necessary code is also added. - -# build_image.sh -This file automates the process of building the Docker Images for you. It takes parameters for project_id and image_tag, and will use these to generate a GCR path and image name. -The generated image name will be inserted into the Component's component.yaml file. Finally, it will build, tag, and push the image to the Container Repository. 
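The component.yaml is the contract a pipeline consumes: KFP reads it to learn which container image to run and which parameters the component expects. As a rough, hypothetical sketch of how such a spec might be consumed with the KFP v1 SDK pinned in the base component requirements (kfp==1.8.14) — the local path, pipeline name and default project id below are illustrative only, not part of the deleted sources:

```python
# Hypothetical usage sketch, assuming kfp==1.8.14 (KFP v1 SDK).
# The YAML path, pipeline name and default project id are illustrative only.
from kfp import components, compiler, dsl

# KFP parses the YAML to discover the container image, the inputs and the command line.
check_gpu_op = components.load_component_from_file(
    "mlpipelines/example_components/check_gpu_component/component.yaml"
)


@dsl.pipeline(name="check-gpu", description="Runs the GPU check component once.")
def check_gpu_pipeline(project_id: str = "my-gcp-project"):
    # Each call to the loaded op becomes one containerized step in the pipeline.
    check_gpu_op(project_id=project_id).set_display_name("check gpu component")


if __name__ == "__main__":
    # Compile to a workflow package that can be uploaded to a Kubeflow Pipelines host.
    compiler.Compiler().compile(check_gpu_pipeline, "check_gpu_pipeline.yaml")
```

The same load-and-call pattern should apply to any component built from this template; only the YAML path and the input names change.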
- -For more details regarding building custom reusable components with Kubeflow Pipelines, please see the [Kubeflow Documentation](https://www.kubeflow.org/docs/pipelines/sdk/component-development/) - - diff --git a/mlpipelines/example_components/check_gpu_component/__init__.py b/mlpipelines/example_components/check_gpu_component/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/check_gpu_component/build_image.sh b/mlpipelines/example_components/check_gpu_component/build_image.sh deleted file mode 100644 index 5b4bc5eaf..000000000 --- a/mlpipelines/example_components/check_gpu_component/build_image.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -e -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Read component configs -. ../../components.config - -# Set some variables -ARTIFACT_PATH=${PROJECT_ARTIFACT_PATH} -IMAGE_NAME="components/${PWD##*/}" -IMAGE_TAG="latest" - -# Create full name of the image -FULL_IMAGE_NAME=${ARTIFACT_PATH}/${IMAGE_NAME}:${IMAGE_TAG} -echo $FULL_IMAGE_NAME - -gcloud builds submit --machine-type n1-highcpu-32 . -t "$FULL_IMAGE_NAME" diff --git a/mlpipelines/example_components/check_gpu_component/component.yaml b/mlpipelines/example_components/check_gpu_component/component.yaml deleted file mode 100644 index 1f3c5f16f..000000000 --- a/mlpipelines/example_components/check_gpu_component/component.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: check_gpu_component -description: A component that checks the utilization of GPU. - -inputs: - - name: project_id - description: The id of the gcp-project - type: String - -implementation: - container: - image: europe-west1-docker.pkg.dev/storied-landing-366912/storied-landing-366912-default-repository/mlpipelines/kubeflow/components/check_gpu_component:latest - command: [ - python3, main.py, - --project-id, {inputValue: project_id} - ] \ No newline at end of file diff --git a/mlpipelines/example_components/check_gpu_component/requirements.txt b/mlpipelines/example_components/check_gpu_component/requirements.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/check_gpu_component/src/__init__.py b/mlpipelines/example_components/check_gpu_component/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/check_gpu_component/src/main.py b/mlpipelines/example_components/check_gpu_component/src/main.py deleted file mode 100644 index 7b5603a44..000000000 --- a/mlpipelines/example_components/check_gpu_component/src/main.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -This file is the entrypoint of the component. It will parse all arguments -and give them to the actual core of the component. 
-""" -import argparse -import logging - -# pylint: disable=import-error -from helpers import kfp_helpers - -PARSER = argparse.ArgumentParser() -PARSER.add_argument('--project-id', - type=str, - required=True, - help='The id of the gcp-project') - - -def component(project_id: str) -> None: - """ - A basic component to check the GPU usage in KFP component - Args: - project_id (str): The id of the gcp-project - """ - - logging.info('Started job...') - - logging.info('Project ID: %s', project_id) - - # Show CUDA availability - kfp_helpers.get_cuda_availability() - - logging.info('Job completed.') - - -if __name__ == '__main__': - ARGS = PARSER.parse_args() - component(ARGS.project_id) diff --git a/mlpipelines/example_components/example_component/Dockerfile b/mlpipelines/example_components/example_component/Dockerfile deleted file mode 100644 index 4d41f8bc8..000000000 --- a/mlpipelines/example_components/example_component/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3.7-slim - -# install requirements -COPY requirements.txt / -RUN python3.7 -m pip install --no-cache-dir -r requirements.txt - -# Copy over src-files of the component -COPY src /src - -# TODO: ADD EXTRA FILES HERE IF NEEDED! - -# Set the working directory to the source folder -WORKDIR /src - -ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/mlpipelines/example_components/example_component/README.md b/mlpipelines/example_components/example_component/README.md deleted file mode 100644 index 07ad410b7..000000000 --- a/mlpipelines/example_components/example_component/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# example_component - -### Description - -This is an example of a Kubeflow Pipelines Reusable Component. -Kubeflow components are the steps in your ML Pipelines. -They are self-contained pieces of code that are analogous to functions, with a name, inputs and outputs, and a body of logic. -A conceptual overview of components in Kubeflow can be found in the [Kubeflow Documentation](https://www.kubeflow.org/docs/pipelines/overview/concepts/component/) - -A component comprises of: -1. src/main.py -2. component.yaml -3. Dockerfile -4. build_image.sh - -# src/main.py -The main.py file contains the main logic of our component as well as the argument parsing. -To update this file for your usecase, you need only update the logic within the function `component()`, adding any parameters if needed. -You can update the function's name but if you do so, don't forget to update the reference in the `__main__` call on line 43. -If you add parameters to this component, don't forget to update the argument references in the component.yaml file. -You can also specify the component function in a separate python file from main.py, provided you include the import statement in the main.py. - -# component.yaml -This YAML files contains the specification for the component. It includes details of the component's input and output parametes, as well as a reference to the Container Image. -In this implementation, the Container Image reference in this file is updated by a sed command in the build_image.sh script, so will be automatically updated when the Compoenent Image is built. -Pipelines that want to use this component are pointed to the Component's YAML file to know which image to use in the pipeline, and what parameters the componenet expects. - -# Dockerfile -This file is standard for building Docker images. It contains all of the commands used to assemble a Docker Image and is used to automate these steps in sequence. 
-If your Component relies on extra, perhaps custom, modules of code that needs to be included with your Component Code, you can add a COPY statement below that copying the 'src/' folder on line 8. -This will make sure that when the image is built, the necessary code is also added. - -# build_image.sh -This file automates the process of building the Docker Images for you. It takes parameters for project_id and image_tag, and will use these to generate an GCR path and image name. -Finally, it will build, tag, and push the image to the Container Repository. - -For more details regarding building custom reusable components with Kubeflow Pipelines, please see the [Kubeflow Documentation](https://www.kubeflow.org/docs/pipelines/sdk/component-development/) - - diff --git a/mlpipelines/example_components/example_component/__init__.py b/mlpipelines/example_components/example_component/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/example_component/build_image.sh b/mlpipelines/example_components/example_component/build_image.sh deleted file mode 100644 index 5b4bc5eaf..000000000 --- a/mlpipelines/example_components/example_component/build_image.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -e -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Read component configs -. ../../components.config - -# Set some variables -ARTIFACT_PATH=${PROJECT_ARTIFACT_PATH} -IMAGE_NAME="components/${PWD##*/}" -IMAGE_TAG="latest" - -# Create full name of the image -FULL_IMAGE_NAME=${ARTIFACT_PATH}/${IMAGE_NAME}:${IMAGE_TAG} -echo $FULL_IMAGE_NAME - -gcloud builds submit --machine-type n1-highcpu-32 . -t "$FULL_IMAGE_NAME" diff --git a/mlpipelines/example_components/example_component/component.yaml b/mlpipelines/example_components/example_component/component.yaml deleted file mode 100644 index 8e105e039..000000000 --- a/mlpipelines/example_components/example_component/component.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: example_component -description: A basic component that takes a project ID as input - and writes it to an output file. 
-inputs: - - name: project_id - description: The id of the gcp-project - type: String - -outputs: - - name: project_id_file - description: Path to the local file containing the gcs path where the output has been stored -implementation: - container: - image: europe-west1-docker.pkg.dev/storied-landing-366912/storied-landing-366912-default-repository/mlpipelines/kubeflow/components/example_component:latest - command: [ - python3, main.py, - --project-id, {inputValue: project_id}, - --project-id-file, {outputPath: project_id_file}, - ] \ No newline at end of file diff --git a/mlpipelines/example_components/example_component/requirements.txt b/mlpipelines/example_components/example_component/requirements.txt deleted file mode 100644 index 15cd68e70..000000000 --- a/mlpipelines/example_components/example_component/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -# Add component requirements here \ No newline at end of file diff --git a/mlpipelines/example_components/example_component/src/__init__.py b/mlpipelines/example_components/example_component/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/example_components/example_component/src/main.py b/mlpipelines/example_components/example_component/src/main.py deleted file mode 100644 index 10f9603b6..000000000 --- a/mlpipelines/example_components/example_component/src/main.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -This file is the entrypoint of the component. It will parse all arguments -and give them to the actual core of the component. -""" -import argparse -import logging -from pathlib import Path - - -PARSER = argparse.ArgumentParser() -PARSER.add_argument('--project-id', - type=str, - required=True, - help='The id of the gcp-project') -PARSER.add_argument('--project-id-file', - type=str, - required=True, - help='Path to the output file') -# TODO: Add arguments here! - - -def component(project_id: str, - project_id_file: str) -> None: - """ - A basic component that takes a project ID as input - and writes it to an output file. - Args: - project_id (str): The id of the gcp-project - project_id_file (str): Filename in which kubeflow will store Project ID - """ - - logging.info('Started job...') - # TODO: Add component logic here! 
- Path(project_id_file).parent.mkdir(parents=True, exist_ok=True) - Path(project_id_file).write_text(str(project_id)) - - logging.info('Job completed.') - - -if __name__ == '__main__': - ARGS = PARSER.parse_args() - component(ARGS.project_id, - ARGS.project_id_file) diff --git a/mlpipelines/example_components/example_component/src/test_manifest.parquet b/mlpipelines/example_components/example_component/src/test_manifest.parquet deleted file mode 100644 index 188cfd698..000000000 --- a/mlpipelines/example_components/example_component/src/test_manifest.parquet +++ /dev/null @@ -1 +0,0 @@ -{"index": {"location": "gs://storied-landing-366912-kfp-output/custom_artifact/test/test_component/index/index.parquet", "type": "parquet", "extensions": ["parquet"], "n_files": 1, "n_items": 4}, "data_sources": {"metadata": {"location": "gs://storied-landing-366912-kfp-output/custom_artifact/test/test_component/metadata/metadata.parquet", "type": "parquet", "extensions": ["parquet"], "n_files": 1, "n_items": 4}, "captions": {"location": "gs://storied-landing-366912-kfp-output/custom_artifact/test/test_component/captions/captions.parquet", "type": "parquet", "extensions": ["parquet"], "n_files": 1, "n_items": 4}}, "metadata": {"artifact_bucket": "storied-landing-366912-kfp-output", "run_id": "test", "component_id": "", "component_name": "test_component", "branch": "", "commit_hash": "", "creation_date": "01-03-2023_18-03-08", "num_items": 0}} \ No newline at end of file diff --git a/mlpipelines/pipelines/__init__.py b/mlpipelines/pipelines/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/mlpipelines/pipelines/example_pipeline.py b/mlpipelines/pipelines/example_pipeline.py deleted file mode 100644 index 7ab9314cb..000000000 --- a/mlpipelines/pipelines/example_pipeline.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -This module includes a lightweight component definition, -a reusable component import and a basic pipeline utilising -these elements/ -""" -from kfp import components as comp -from kfp import dsl - -from config.general_config import GeneralConfig, KubeflowConfig -from helpers.lightweight import read_input -from helpers.upload import compile_and_upload_pipeline - -# Load Component -example_component_op = comp.load_component( - '/home/philippe/Scripts/express-pipelines/mlpipelines/example_components/example_component/component.yaml') - -read_input_op = comp.func_to_container_op(func=read_input) - - -# Pipeline -@dsl.pipeline( - name='example pipeline', - description='Basic example of a Kubeflow Pipeline' -) -def example_pipeline(gcp_project_id=GeneralConfig.GCP_PROJECT_ID): - """ Basic pipeline to demonstrate lightweight components, - reusable components, a passing of values between components. - Takes a project ID string as argument and prints it out. 
- Args: - gcp_project_id (str): project ID string - """ - # pylint: disable=not-callable,unused-variable - - example_component_task = example_component_op( - # Component Arguments go here - project_id=gcp_project_id - ).set_display_name('example component') - - read_input_task = read_input_op( - # lightweight component arguments - # with example of passing outputs from previous - # component - input_v=example_component_task.outputs['project_id_file'] - ) - - -if __name__ == '__main__': - compile_and_upload_pipeline(pipeline=example_pipeline, - host=KubeflowConfig.HOST, - env=KubeflowConfig.ENV) diff --git a/mlpipelines/pipelines/helpers/__init__.py b/mlpipelines/pipelines/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pylint-modules b/pylint-modules index 499e2fe21..06ee6a024 100644 --- a/pylint-modules +++ b/pylint-modules @@ -1,12 +1,10 @@ -mlpipelines/pipelines -mlpipelines/common -mlpipelines/example_components/check_gpu_component -mlpipelines/example_components/example_component -mlpipelines/components/base_component -mlpipelines/components/dataset_loader_component -mlpipelines/components/image_filter_component -mlpipelines/components/image_conversion_component -mlpipelines/components/image_embedding_component -mlpipelines/components/clip_retrieval_component -mlpipelines/components/clip_downloader_component -mlpipelines/components/image_caption_component \ No newline at end of file +express +examples/finetune_stable_diffusion/components/clip_downloader_component +examples/finetune_stable_diffusion/components/clip_retrieval_component +examples/finetune_stable_diffusion/components/dataset_loader_component +examples/finetune_stable_diffusion/components/image_caption_component +examples/finetune_stable_diffusion/components/image_classifier_component +examples/finetune_stable_diffusion/components/image_conversion_component +examples/finetune_stable_diffusion/components/image_embedding_component +examples/finetune_stable_diffusion/components/image_filter_component +examples/finetune_stable_diffusion/components/sd_finetuning_component \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4f9b4dfab..681725f9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "Express" +name = "express" version = "0.1.dev0" description = "Express - Composable pipelines for foundation model finetuning" readme = "README.md" @@ -40,6 +40,8 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.8" +dataclasses-json = "^0.5.7" +pandas = "^1.3.5" [tool.poetry.group.test.dependencies] bandit = { version = "^1.7.4", extras = ["toml"] } @@ -51,4 +53,4 @@ exclude_dirs = ["*/mlpipelines/components/sd_finetuning_component/src/train_text [build-system] requires = ["poetry-core>=1.2.0"] -build-backend = "poetry.core.masonry.api" +build-backend = "poetry.core.masonry.api" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1353841d7..000000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -bandit[toml]==1.7.4 -pylint==2.5.3 -liccheck==0.7.3 -toml==0.10.2 \ No newline at end of file
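The deleted example_pipeline.py above wires a reusable component (loaded from its component.yaml) to a lightweight component (`read_input`, imported from helpers but not included in this diff) by passing the file-based `project_id_file` output between tasks. A minimal, hypothetical sketch of that pattern with the KFP v1 SDK — the `read_input` body and the component.yaml path are assumptions; only the wiring mirrors the deleted pipeline:

```python
# Hypothetical sketch of the lightweight-component pattern used in example_pipeline.py.
# Assumes kfp==1.8.14; the read_input body and the component.yaml path are illustrative only.
from kfp import components as comp
from kfp import dsl


def read_input(input_v: str) -> str:
    """Lightweight component: logs the value handed over by the previous step."""
    print(f"Received input: {input_v}")
    return input_v


# Wrap the plain Python function into a pipeline step that runs in its own container.
read_input_op = comp.func_to_container_op(func=read_input)

# The reusable component is loaded from its component.yaml, as in the deleted pipeline.
example_component_op = comp.load_component_from_file(
    "mlpipelines/example_components/example_component/component.yaml"
)


@dsl.pipeline(name="example pipeline", description="Reusable plus lightweight component wiring")
def example_pipeline(gcp_project_id: str = "my-gcp-project"):
    example_task = example_component_op(project_id=gcp_project_id)
    # The output declared as 'project_id_file' in component.yaml is passed on by name.
    read_input_op(input_v=example_task.outputs["project_id_file"])
```

In the deleted pipeline, compilation and upload are handled by the `compile_and_upload_pipeline` helper rather than spelled out as above.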