Skip to content

Commit

Permalink
Add HF Dataset components (#8)
Browse files Browse the repository at this point in the history
This PR adds boilerplate `HFDatasetLoaderComponent` and
`HFDatasetTransformComponent` classes.
  • Loading branch information
PhilippeMoussalli committed Mar 10, 2023
1 parent 4dac264 commit 242af4c
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 5 deletions.
19 changes: 16 additions & 3 deletions examples/components/single_node/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,24 @@ with the `poetry install` command from the root of the repository.

# Usage

Run with the following commad:
## Pandas

Run the following command to upload a Pandas dataframe to a Google Cloud Storage bucket:

```
python3 data_loading_pandas.py --extra-args '{"project_id": "storied-landing-366912"}' \
--output-manifest <local_path> \
--metadata-args '{"run_id":"test","component_name":"test_component", \
"artifact_bucket":"storied-landing-366912-kfp-output"}'
```

## Hugging Face Datasets

Run the following command to load a Hugging Face Dataset from the [hub](https://huggingface.co/) and upload it to a Google Cloud Storage bucket:

```
python3 data_loading.py --extra-args '{"project_id": "storied-landing-366912"}' \
python3 data_loading_hf_datasets.py --extra-args '{"project_id": "soy-audio-379412"}' \
--output-manifest <local_path> \
--metadata-args '{"run_id":"test","component_name":"test_component", \
"artifact_bucket":"storied-landing-366912-kfp-output"}
"artifact_bucket":"soy-audio-379412_kfp-artifacts"}'
```
46 changes: 46 additions & 0 deletions examples/components/single_node/data_loading_hf_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Optional, Dict, Union

import pandas as pd
from datasets import Dataset, load_dataset, concatenate_datasets

from express.components.hf_datasets_components import HFDatasetsLoaderComponent, HFDatasetsDatasetDraft
from express.logger import configure_logging


# pylint: disable=too-few-public-methods
class SeedDatasetLoader(HFDatasetsLoaderComponent):
"""Class that inherits from Hugging Face data loading """

@classmethod
def load(cls, extra_args: Optional[
Dict[str, Union[str, int, float, bool]]] = None) -> HFDatasetsDatasetDraft:
"""
An example function showcasing the data loader component using Express functionalities
Args:
extra_args (Optional[Dict[str, Union[str, int, float, bool]]): optional args to pass to
the function (e.g. seed data source)
Returns:
HFDatasetsDatasetDraft: a dataset draft that creates a plan for an output datasets/manifest
"""
configure_logging()

# 1) Create example data source(s)
caption_dataset = load_dataset("lambdalabs/pokemon-blip-captions", split="train")

# 2) Create an example index
index_list = [f"image_{idx}" for idx in range(len(caption_dataset))]
index_df = pd.DataFrame(index_list, columns=['index'])
index = Dataset.from_pandas(index_df)

caption_dataset = concatenate_datasets([index, caption_dataset])

# 2.2) Create data_source dictionary
data_sources = {"captions": caption_dataset}

# 3) Create dataset draft from index and additional data sources
dataset_draft = HFDatasetsDatasetDraft(index=index, data_sources=data_sources)
return dataset_draft


if __name__ == '__main__':
SeedDatasetLoader.run()
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def load(cls, extra_args: Optional[
'size': [300, 400, 500, 600],
'format': ['jpeg', 'jpeg', 'jpeg', 'jpeg']}
df_metadata = pd.DataFrame(metadata).set_index('index')
# 2.1.2) Caption]
# 2.1.2) Caption
captions = {'index': index_list,
'captions': ['dog', 'cat', 'bear', 'duck']}
df_captions = pd.DataFrame(captions).set_index('index')
Expand Down
160 changes: 160 additions & 0 deletions express/components/hf_datasets_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Hugging Face Datasets single component module """

import os
import importlib
import tempfile
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Union

import datasets
from datasets import load_dataset

from express.storage_interface import StorageHandlerModule
from express.manifest import DataManifest, DataSource, DataType
from .common import (
ExpressDatasetHandler,
ExpressDataset,
ExpressTransformComponent,
ExpressDatasetDraft,
ExpressLoaderComponent,
)


# Define interface of pandas draft
# pylint: disable=unsubscriptable-object
HFDatasetsDatasetDraft = ExpressDatasetDraft[List[str], datasets.Dataset]

# pylint: disable=no-member
STORAGE_MODULE_PATH = StorageHandlerModule().to_dict()[
os.environ.get("CLOUD_ENV", "GCP")
]
STORAGE_HANDLER = importlib.import_module(STORAGE_MODULE_PATH).StorageHandler()


# pylint: disable=too-few-public-methods
class HFDatasetsDataset(
ExpressDataset[List[str], Union[datasets.Dataset, datasets.DatasetDict]]
):
"""Hugging Face Datasets dataset"""

def load_index(self) -> datasets.Dataset:
"""Function that loads in the index"""
with tempfile.TemporaryDirectory() as tmp_dir:
local_parquet_path = STORAGE_HANDLER.copy_file(
self.manifest.index.location, tmp_dir
)

# we specify "train" here to get a `Dataset` instead of a `DatasetDict`
dataset = load_dataset(
"parquet", data_files=local_parquet_path, split="train"
)

return dataset

@staticmethod
def _load_data_source(
data_source: DataSource, index_filter: datasets.Dataset
) -> Union[datasets.Dataset, datasets.DatasetDict]:
"""Function that loads in a data source"""
if data_source.type != DataType.parquet:
raise TypeError("Only reading from parquet is currently supported.")

with tempfile.TemporaryDirectory() as tmp_dir:
data_source_location = data_source.location

local_parquet_path = STORAGE_HANDLER.copy_parquet(
data_source_location, tmp_dir
)

data_source_hf_datasets = load_dataset(
"parquet", data_dir=local_parquet_path
)

if index_filter:
index = index_filter["index"]
return data_source_hf_datasets.select([index])

return data_source_hf_datasets


class HFDatasetsDatasetHandler(ExpressDatasetHandler[List[str], datasets.Dataset]):
"""Hugging Face Datasets Dataset handler"""

@staticmethod
def _upload_parquet(
data: Union[datasets.Dataset, datasets.DatasetDict], name: str, remote_path: str
) -> DataSource:
with tempfile.TemporaryDirectory() as temp_folder:
# TODO: uploading without writing to temp file
# TODO: sharded parquet? not sure if we should shard the index or only the data sources
dataset_path = f"{temp_folder}/{name}.parquet"

data.to_parquet(path_or_buf=dataset_path)

fully_qualified_blob_path = f"{remote_path}/{name}.parquet"
STORAGE_HANDLER.copy_file(
source_file=dataset_path, destination=fully_qualified_blob_path
)

return DataSource(
location=fully_qualified_blob_path,
type=DataType.PARQUET,
extensions=["parquet"],
n_files=1,
n_items=len(data),
)

@classmethod
def _upload_index(cls, index: datasets.Dataset, remote_path: str) -> DataSource:
data_source = cls._upload_parquet(
data=index, name="index", remote_path=remote_path
)
return data_source

@classmethod
def _upload_data_source(
cls,
name: str,
data: Union[datasets.Dataset, datasets.DatasetDict],
remote_path: str,
) -> DataSource:
data_source = cls._upload_parquet(data=data, name=name, remote_path=remote_path)
return data_source

@classmethod
def _load_dataset(cls, input_manifest: DataManifest) -> HFDatasetsDataset:
return HFDatasetsDataset(input_manifest)


class HFDatasetsTransformComponent(
HFDatasetsDatasetHandler,
ExpressTransformComponent[List[str], datasets.Dataset],
ABC,
):
"""
Hugging Face Datasets dataset transformer. Subclass this class to define custom
transformation function
"""

@classmethod
@abstractmethod
def transform(
cls,
data: HFDatasetsDataset,
extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None,
) -> HFDatasetsDatasetDraft:
"""Transform dataset"""


class HFDatasetsLoaderComponent(
HFDatasetsDatasetHandler, ExpressLoaderComponent[List[str], datasets.Dataset], ABC
):
"""Hugging Face Datasets dataset loader. Subclass this class to define custom
transformation function"""

@classmethod
@abstractmethod
def load(
cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None
) -> HFDatasetsDatasetDraft:
"""Load initial dataset"""
2 changes: 1 addition & 1 deletion express/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DataType(str, Enum):
@classmethod
def is_valid(cls, data_type: str) -> bool:
"""Check if data type is valid"""
return data_type in cls._member_names_
return data_type in cls.__members__.values()


@dataclass
Expand Down
9 changes: 9 additions & 0 deletions license_strategy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,14 @@ authorized_licenses:
3-clause BSD
new BSD
MIT
GNU Library or Lesser General Public License (LGPL)
GNU Lesser General Public License v2 (LGPLv2)
GNU Lesser General Public License v2 or later (LGPLv2+)
GNU Lesser General Public License v3 (LGPLv3)
GNU Lesser General Public License v3 or later (LGPLv3+)
Mozilla Public License 1.0 (MPL)
Mozilla Public License 1.1 (MPL 1.1)
Mozilla Public License 2.0 (MPL 2.0)
The Unlicense (Unlicense)

[Authorized Packages]
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ classifiers = [
python = "^3.8"
dataclasses-json = "^0.5.7"
pandas = "^1.3.5"
datasets = "2.10.1"

[tool.poetry.group.test.dependencies]
liccheck = "^0.7.3"
Expand Down

0 comments on commit 242af4c

Please sign in to comment.