More features around Beam. #731

Merged: 4 commits, Sep 5, 2024
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/__init__.py
@@ -1,6 +1,7 @@
"""Defines the public interface to the `mlcroissant` package."""

from mlcroissant._src import torch
from mlcroissant._src.beam import ReadFromCroissant
from mlcroissant._src.core import constants
from mlcroissant._src.core.constants import DataType
from mlcroissant._src.core.constants import EncodingFormat
@@ -44,6 +45,7 @@
"Organization",
"Person",
"Rdf",
"ReadFromCroissant",
"Records",
"RecordSet",
"Source",
72 changes: 72 additions & 0 deletions python/mlcroissant/mlcroissant/_src/beam.py
@@ -0,0 +1,72 @@
"""Beam module."""

from __future__ import annotations

from collections.abc import Mapping
import typing
from typing import Any

from etils import epath

from mlcroissant._src.datasets import Dataset

if typing.TYPE_CHECKING:
import apache_beam as beam


def ReadFromCroissant(
*,
pipeline: beam.Pipeline,
jsonld: epath.PathLike | Mapping[str, Any],
record_set: str,
mapping: Mapping[str, epath.PathLike] | None = None,
):
"""Returns an Apache Beam reader to generate the dataset using e.g. Spark.

Example of usage:

```python
import apache_beam as beam
from apache_beam.options import pipeline_options
import mlcroissant as mlc

jsonld = "https://huggingface.co/api/datasets/ylecun/mnist/croissant"

options = pipeline_options.PipelineOptions()
with beam.Pipeline(options=options) as pipeline:
mlc.ReadFromCroissant(
pipeline=pipeline,
jsonld=jsonld,
record_set="default",
)
```

Only streamable datasets can be used with Beam. A streamable dataset is a dataset
that can be generated by a linear sequence of operations - without joins for
example. This is the case for Hugging Face datasets. If there are branches, we'd
need a more complex Beam pipeline.
Contributor:
So IIUC we will change this in future work to work with non-streamable datasets (e.g. future versions of HF croissants) -- I feel if this is right, this would deserve a mention here, or a link to an issue?

Contributor Author:
Done.

TODO(https://github.com/mlcommons/croissant/issues/733): handle branches.

The sharding is done on the filtered files. This is currently optimized for Hugging
Face datasets, so it raises an error if the dataset is not a Hugging Face dataset.

Args:
pipeline: A Beam pipeline.
jsonld: A JSON object or a path to a Croissant file (URL, str or pathlib.Path).
record_set: The name of the record set to generate.
mapping: Mapping filename->filepath as a Python dict[str, str] to handle manual
downloads. If `document.csv` is the FileObject and you downloaded it to
`~/Downloads/document.csv`, you can specify `mapping={"document.csv":
"~/Downloads/document.csv"}`.

Returns:
A Beam PCollection with all the records, where each element is a tuple with
a) a global index, and
b) the content of the record.

Raises:
A ValueError if the dataset is not streamable.
"""
dataset = Dataset(jsonld=jsonld, mapping=mapping)
return dataset.records(record_set).beam_reader(pipeline)
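For context, a minimal sketch of consuming the (global index, record) tuples that `ReadFromCroissant` returns; the "Drop index" step is an illustrative assumption, not part of this PR:

```python
import apache_beam as beam
from apache_beam.options import pipeline_options

import mlcroissant as mlc

jsonld = "https://huggingface.co/api/datasets/ylecun/mnist/croissant"

options = pipeline_options.PipelineOptions()
with beam.Pipeline(options=options) as pipeline:
    records = mlc.ReadFromCroissant(
        pipeline=pipeline,
        jsonld=jsonld,
        record_set="default",
    )
    # Each element is a (global_index, record) tuple; the index can be dropped
    # before further processing (illustrative step, not part of the PR).
    _ = records | "Drop index" >> beam.Map(lambda element: element[1])
```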
36 changes: 1 addition & 35 deletions python/mlcroissant/mlcroissant/_src/datasets.py
@@ -168,41 +168,7 @@ def __iter__(self):
)

def beam_reader(self, pipeline: beam.Pipeline):
"""Returns an Apache Beam reader to generate the dataset using e.g. Spark.

Example of usage:

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import mlcroissant as mlc

dataset = mlc.Dataset(
jsonld="https://huggingface.co/api/datasets/ylecun/mnist/croissant",
)
pipeline_options = PipelineOptions()
with beam.Pipeline(options=pipeline_options) as pipeline:
_ = dataset.records("mnist").beam_reader(pipeline)
```

Only streamable datasets can be used with Beam. A streamable dataset is a
dataset that can be generated by a linear sequence of operations - without joins
for example. This is the case for Hugging Face datasets. If there are branches,
we'd need a more complex Beam pipeline.

The sharding is done on the filtered files. This is currently optimized for
Hugging Face datasets, so it raises an error if the dataset is not a Hugging
Face dataset.

Args:
A Beam pipeline.

Returns:
A Beam PCollection with all the records.

Raises:
A ValueError if the dataset is not streamable.
"""
"""See ReadFromCroissant docstring."""
operations = self._filter_interesting_operations(self.filters)
execute_downloads(operations)
if not _is_streamable_dataset(operations):
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -121,6 +121,8 @@ def _equal_to_set(expected):
"""Checks whether 2 beam.PCollections are equal as sets."""

def matcher_fn(actual):
# Sort by index, then remove the index from the PCollection returned by Beam:
actual = [element for _, element in sorted(list(actual))]
expected_set = set([
json.dumps(record_to_python(element)) for element in list(expected)
])
72 changes: 67 additions & 5 deletions python/mlcroissant/mlcroissant/_src/operation_graph/execute.py
@@ -3,9 +3,12 @@
from __future__ import annotations

import collections
from collections.abc import Iterable
import concurrent.futures
import functools
import sys
import typing
from typing import Any
from typing import Any, Generator

from absl import logging
import networkx as nx
@@ -22,6 +25,8 @@
if typing.TYPE_CHECKING:
import apache_beam as beam

ElementWithIndex = tuple[int, Any]


def execute_downloads(operations: Operations):
"""Executes all the downloads in the graph of operations."""
@@ -137,7 +142,7 @@ def read_all_files():
def execute_operations_in_beam(
pipeline: beam.Pipeline, record_set: str, operations: Operations
):
"""See beam_reader docstring."""
"""See ReadFromCroissant docstring."""
import apache_beam as beam

list_of_operations = _order_relevant_operations(operations, record_set)
@@ -149,12 +154,69 @@ def execute_operations_in_beam(
files = operation(files)
if isinstance(operation, FilterFiles):
break
pipeline = pipeline | "Shard by files" >> beam.Create(files)
if not isinstance(files, Iterable):
raise ValueError("Could not filter files.")
files = list(files) # even for large datasets, this can be handled in RAM.

# We first shard by file and assign a shard_index.
pipeline = pipeline | "Shard by files with index" >> beam.Create(enumerate(files))
num_shards = len(files)

# We don't know in advance the number of records per shard. So we just allocate the
# maximum number, which is `sys.maxsize // num_shards`. Taking the practical case of
# large, evenly distributed datasets on Hugging Face, we can compute the following:

# num_shards = number of Parquet files per config on Hugging Face < 10 billion files
# max_shard_size ~ 1 billion records per Parquet files

# So it seems we can run with this trick without too many problems. We still trigger
# a ValueError below if the error arises, and we ask the user to open a bug. A real
# solution to this problem would be to compute the shard_sizes in parallel with
# generating the records.
# TODO(https://github.com/mlcommons/croissant/issues/732): Compute shard_sizes
# explicitly instead of relying on max_shard_size.
max_shard_size = sys.maxsize // num_shards
while queue_of_operations:
operation = queue_of_operations.popleft()
if isinstance(operation, ReadFields):
beam_operation = beam.ParDo(operation)
beam_operation = beam.ParDo(
Contributor:
We are 100% sure that only ReadFields operations can be leaves?

Contributor Author:
Yes! But it's more linked to the fact that ReadFields is a generator.

functools.partial(
_add_global_index,
operation=operation,
max_shard_size=max_shard_size,
)
)
else:
beam_operation = beam.Map(operation)
beam_operation = beam.Map(
functools.partial(_pass_index, operation=operation)
)
pipeline |= beam_operation
return pipeline


def _add_global_index(
element_with_index: ElementWithIndex,
operation: Operation,
max_shard_size: int,
) -> Generator[ElementWithIndex, None, None]:
"""Computes the global index given the shard size."""
shard_index, element = element_with_index
for index_in_shard, result in enumerate(operation(element)):
if index_in_shard >= max_shard_size:
raise ValueError(
"WARNING: This was very unlikely, but it seems we just hit this limit"
" in the code. Find another way to optimize execute_operations_in_beam."
" Please, open a PR on GitHub to make the maintainers aware of this"
" issue. A fix is to compute the actual shard_sizes rather than relying"
" on a heuristic (see comments above in code)."
)
new_index = max_shard_size * shard_index + index_in_shard
yield (new_index, result)


def _pass_index(
element_with_index: tuple[int, Any], operation: Operation
) -> ElementWithIndex:
"""Passes the index to the next operation while executing the operation."""
index, element = element_with_index
return (index, operation(element))
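For context, a minimal standalone sketch (plain Python, no Beam) of the global-index arithmetic used by `_add_global_index` and relied on by the sorting in `datasets_test.py`; the toy shards below are made-up values:

```python
import sys

# Toy example: 3 shards with a few records each (made-up data).
shards = [["a0", "a1"], ["b0"], ["c0", "c1", "c2"]]

num_shards = len(shards)
max_shard_size = sys.maxsize // num_shards

global_records = []
for shard_index, shard in enumerate(shards):
    for index_in_shard, record in enumerate(shard):
        # Same arithmetic as _add_global_index: the global index stays unique
        # across shards as long as index_in_shard < max_shard_size.
        global_index = max_shard_size * shard_index + index_in_shard
        global_records.append((global_index, record))

# Sorting by the global index restores a deterministic order, which is what the
# test matcher does before stripping the index.
assert [r for _, r in sorted(global_records)] == ["a0", "a1", "b0", "c0", "c1", "c2"]
```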
@@ -304,8 +304,8 @@ def there_exists_at_least_one_property(self, possible_properties: list[str]):
return False

def __hash__(self):
"""Hashes all immutable arguments."""
return hash(self.uuid)
"""Re-uses parent's hash function (i.e., object)."""
return super().__hash__()

def __eq__(self, other: Any) -> bool:
"""Compares two Nodes given their arguments."""
@@ -330,31 +330,6 @@ def __deepcopy__(self, memo):
memo[id(self)] = copy
return copy

def __reduce__(self):
"""Allows pickling the node.

`self.ctx` is stored separately in the state because it's not pickable directly.
"""
state = self.__getstate__()
args = tuple(state.values())
return (self.__class__, args, {"ctx": self.ctx})

def __getstate__(self):
"""Overwrites __getstate__ for pickling."""
state = {}
for field in dataclasses.fields(self):
if field.name == "ctx":
ctx = Context()
ctx.graph = self.ctx.graph
state[field.name] = ctx
else:
state[field.name] = getattr(self, field.name)
return state

def __setstate__(self, state):
"""Overwrites __setstate__ for pickling."""
self.ctx = state["ctx"]

def to_json(self) -> Json:
"""Converts the Python class to JSON."""
cls = self.__class__