More features around Beam. #731

Merged: 4 commits, Sep 5, 2024
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/__init__.py
@@ -1,6 +1,7 @@
"""Defines the public interface to the `mlcroissant` package."""

from mlcroissant._src import torch
from mlcroissant._src.beam import ReadFromCroissant
from mlcroissant._src.core import constants
from mlcroissant._src.core.constants import DataType
from mlcroissant._src.core.constants import EncodingFormat
@@ -44,6 +45,7 @@
"Organization",
"Person",
"Rdf",
"ReadFromCroissant",
"Records",
"RecordSet",
"Source",
72 changes: 72 additions & 0 deletions python/mlcroissant/mlcroissant/_src/beam.py
@@ -0,0 +1,72 @@
"""Beam module."""

from __future__ import annotations

from collections.abc import Mapping
import typing
from typing import Any

from etils import epath

from mlcroissant._src.datasets import Dataset

if typing.TYPE_CHECKING:
import apache_beam as beam


def ReadFromCroissant(
*,
pipeline: beam.Pipeline,
jsonld: epath.PathLike | Mapping[str, Any],
record_set: str,
mapping: Mapping[str, epath.PathLike] | None = None,
):
"""Returns an Apache Beam reader to generate the dataset using e.g. Spark.

Example of usage:

```python
import apache_beam as beam
from apache_beam.options import pipeline_options
import mlcroissant as mlc

jsonld = "https://huggingface.co/api/datasets/ylecun/mnist/croissant"

options = pipeline_options.PipelineOptions()
with beam.Pipeline(options=options) as pipeline:
mlc.ReadFromCroissant(
pipeline=pipeline,
jsonld=jsonld,
record_set="default",
)
```

Only streamable datasets can be used with Beam. A streamable dataset is a dataset
that can be generated by a linear sequence of operations - without joins for
example. This is the case for Hugging Face datasets. If there are branches, we'd
need a more complex Beam pipeline.
Contributor:
So IIUC we will change this in future work to work with non-streamable datasets (e.g. future versions of HF croissants) -- I feel if this is right, this would deserve a mention here, or a link to an issue?

Contributor Author:
Done.

TODO(https://github.com/mlcommons/croissant/issues/733): handle branches.

The sharding is done on the filtered files. This is currently optimized for Hugging
Face datasets, so it raises an error if the dataset is not a Hugging Face dataset.

Args:
pipeline: A Beam pipeline.
jsonld: A JSON object or a path to a Croissant file (URL, str or pathlib.Path).
record_set: The name of the record set to generate.
mapping: Mapping filename->filepath as a Python dict[str, str] to handle manual
downloads. If `document.csv` is the FileObject and you downloaded it to
`~/Downloads/document.csv`, you can specify `mapping={"document.csv":
"~/Downloads/document.csv"}`.

Returns:
A Beam PCollection with all the records, where each element is a tuple with
a) a global index, and
b) the content of the record.

Raises:
A ValueError if the dataset is not streamable.
"""
dataset = Dataset(jsonld=jsonld, mapping=mapping)
return dataset.records(record_set).beam_reader(pipeline)
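For context, a minimal sketch of consuming the (global index, record) tuples that `ReadFromCroissant` returns; the "Drop index" step is an illustrative assumption, not part of this PR:

```python
import apache_beam as beam
from apache_beam.options import pipeline_options

import mlcroissant as mlc

jsonld = "https://huggingface.co/api/datasets/ylecun/mnist/croissant"

options = pipeline_options.PipelineOptions()
with beam.Pipeline(options=options) as pipeline:
    records = mlc.ReadFromCroissant(
        pipeline=pipeline,
        jsonld=jsonld,
        record_set="default",
    )
    # Each element is a (global_index, record) tuple; the index can be dropped
    # before further processing (illustrative step, not part of the PR).
    _ = records | "Drop index" >> beam.Map(lambda element: element[1])
```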
36 changes: 1 addition & 35 deletions python/mlcroissant/mlcroissant/_src/datasets.py
@@ -168,41 +168,7 @@ def __iter__(self):
)

def beam_reader(self, pipeline: beam.Pipeline):
"""Returns an Apache Beam reader to generate the dataset using e.g. Spark.

Example of usage:

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import mlcroissant as mlc

dataset = mlc.Dataset(
jsonld="https://huggingface.co/api/datasets/ylecun/mnist/croissant",
)
pipeline_options = PipelineOptions()
with beam.Pipeline(options=pipeline_options) as pipeline:
_ = dataset.records("mnist").beam_reader(pipeline)
```

Only streamable datasets can be used with Beam. A streamable dataset is a
dataset that can be generated by a linear sequence of operations - without joins
for example. This is the case for Hugging Face datasets. If there are branches,
we'd need a more complex Beam pipeline.

The sharding is done on the filtered files. This is currently optimized for
Hugging Face datasets, so it raises an error if the dataset is not a Hugging
Face dataset.

Args:
A Beam pipeline.

Returns:
A Beam PCollection with all the records.

Raises:
A ValueError if the dataset is not streamable.
"""
"""See ReadFromCroissant docstring."""
operations = self._filter_interesting_operations(self.filters)
execute_downloads(operations)
if not _is_streamable_dataset(operations):
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -121,6 +121,8 @@ def _equal_to_set(expected):
"""Checks whether 2 beam.PCollections are equal as sets."""

def matcher_fn(actual):
# Sort by index, then remove the index from the PCollection returned by Beam:
actual = [element for _, element in sorted(list(actual))]
expected_set = set([
json.dumps(record_to_python(element)) for element in list(expected)
])
72 changes: 67 additions & 5 deletions python/mlcroissant/mlcroissant/_src/operation_graph/execute.py
@@ -3,9 +3,12 @@
from __future__ import annotations

import collections
from collections.abc import Iterable
import concurrent.futures
import functools
import sys
import typing
from typing import Any
from typing import Any, Generator

from absl import logging
import networkx as nx
@@ -22,6 +25,8 @@
if typing.TYPE_CHECKING:
import apache_beam as beam

ElementWithIndex = tuple[int, Any]


def execute_downloads(operations: Operations):
"""Executes all the downloads in the graph of operations."""
@@ -137,7 +142,7 @@ def read_all_files():
def execute_operations_in_beam(
pipeline: beam.Pipeline, record_set: str, operations: Operations
):
"""See beam_reader docstring."""
"""See ReadFromCroissant docstring."""
import apache_beam as beam

list_of_operations = _order_relevant_operations(operations, record_set)
@@ -149,12 +154,69 @@ def execute_operations_in_beam(
files = operation(files)
if isinstance(operation, FilterFiles):
break
pipeline = pipeline | "Shard by files" >> beam.Create(files)
if not isinstance(files, Iterable):
raise ValueError("Could not filter files.")
files = list(files) # even for large datasets, this can be handled in RAM.

# We first shard by file and assign a shard_index.
pipeline = pipeline | "Shard by files with index" >> beam.Create(enumerate(files))
num_shards = len(files)

# We don't know in advance the number of records per shard. So we just allocate the
# maximum number, which is `sys.maxsize // num_shards`. Taking the practical case of
# large, evenly distributed datasets on Hugging Face, we can compute the following:

# num_shards = number of Parquet files per config on Hugging Face < 10 billion files
# max_shard_size ~ 1 billion records per Parquet files

# So it seems we can run with this trick without too many problems. We still trigger
# a ValueError below if the error arises, and we ask the user to open a bug. A real
# solution to this problem would be to compute the shard_sizes in parallel with
# generating the records.
# TODO(https://github.com/mlcommons/croissant/issues/732): Compute shard_sizes
# explicitly instead of relying on max_shard_size.
max_shard_size = sys.maxsize // num_shards
while queue_of_operations:
operation = queue_of_operations.popleft()
if isinstance(operation, ReadFields):
beam_operation = beam.ParDo(operation)
beam_operation = beam.ParDo(
Contributor:
We are 100% sure that only ReadFields operations can be leaves?

Contributor Author:
Yes! But it's more linked to the fact that ReadFields is a generator.

functools.partial(
_add_global_index,
operation=operation,
max_shard_size=max_shard_size,
)
)
else:
beam_operation = beam.Map(operation)
beam_operation = beam.Map(
functools.partial(_pass_index, operation=operation)
)
pipeline |= beam_operation
return pipeline


def _add_global_index(
element_with_index: ElementWithIndex,
operation: Operation,
max_shard_size: int,
) -> Generator[ElementWithIndex, None, None]:
"""Computes the global index given the shard size."""
shard_index, element = element_with_index
for index_in_shard, result in enumerate(operation(element)):
if index_in_shard >= max_shard_size:
raise ValueError(
"WARNING: This was very unlikely, but it seems we just hit this limit"
" in the code. Find another way to optimize execute_operations_in_beam."
" Please, open a PR on GitHub to make the maintainers aware of this"
" issue. A fix is to compute the actual shard_sizes rather than relying"
" on a heuristic (see comments above in code)."
)
new_index = max_shard_size * shard_index + index_in_shard
yield (new_index, result)


def _pass_index(
element_with_index: tuple[int, Any], operation: Operation
) -> ElementWithIndex:
"""Passes the index to the next operation while executing the operation."""
index, element = element_with_index
return (index, operation(element))
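For context, a minimal standalone sketch (plain Python, no Beam) of the global-index arithmetic used by `_add_global_index` and relied on by the sorting in `datasets_test.py`; the toy shards below are made-up values:

```python
import sys

# Toy example: 3 shards with a few records each (made-up data).
shards = [["a0", "a1"], ["b0"], ["c0", "c1", "c2"]]

num_shards = len(shards)
max_shard_size = sys.maxsize // num_shards

global_records = []
for shard_index, shard in enumerate(shards):
    for index_in_shard, record in enumerate(shard):
        # Same arithmetic as _add_global_index: the global index stays unique
        # across shards as long as index_in_shard < max_shard_size.
        global_index = max_shard_size * shard_index + index_in_shard
        global_records.append((global_index, record))

# Sorting by the global index restores a deterministic order, which is what the
# test matcher does before stripping the index.
assert [r for _, r in sorted(global_records)] == ["a0", "a1", "b0", "c0", "c1", "c2"]
```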
@@ -304,8 +304,8 @@ def there_exists_at_least_one_property(self, possible_properties: list[str]):
return False

def __hash__(self):
"""Hashes all immutable arguments."""
return hash(self.uuid)
"""Re-uses parent's hash function (i.e., object)."""
return super().__hash__()

def __eq__(self, other: Any) -> bool:
"""Compares two Nodes given their arguments."""
@@ -330,31 +330,6 @@ def __deepcopy__(self, memo):
memo[id(self)] = copy
return copy

def __reduce__(self):
"""Allows pickling the node.

`self.ctx` is stored separately in the state because it's not pickable directly.
"""
state = self.__getstate__()
args = tuple(state.values())
return (self.__class__, args, {"ctx": self.ctx})

def __getstate__(self):
"""Overwrites __getstate__ for pickling."""
state = {}
for field in dataclasses.fields(self):
if field.name == "ctx":
ctx = Context()
ctx.graph = self.ctx.graph
state[field.name] = ctx
else:
state[field.name] = getattr(self, field.name)
return state

def __setstate__(self, state):
"""Overwrites __setstate__ for pickling."""
self.ctx = state["ctx"]

def to_json(self) -> Json:
"""Converts the Python class to JSON."""
cls = self.__class__