Remove beam (#6987)
* Delete beam tests

* Delete BeamBasedBuilder

* Delete BeamWriter

* Delete RunBeamCommand

* Delete DownloadManager.ship_files_with_pipeline

* Delete beam_utils

* Delete require_beam

* Delete config beam variables

* Delete apache-beam extras require

* Update setup.py: remove the typing-extensions pin that only existed because of the apache-beam/pydantic conflict

* Delete Beam from docs

* Delete Beam from comments and docstrings

* Delete tests of HF GCP
albertvillanova authored Jun 26, 2024
1 parent 637246b commit b275462
Showing 19 changed files with 9 additions and 1,069 deletions.
2 changes: 0 additions & 2 deletions docs/source/_toctree.yml
@@ -50,8 +50,6 @@
title: CLI
- local: how_to_metrics
title: Metrics
- local: beam
title: Beam Datasets
- local: troubleshoot
title: Troubleshooting
title: "General usage"
52 changes: 0 additions & 52 deletions docs/source/beam.mdx

This file was deleted.

3 changes: 1 addition & 2 deletions docs/source/cli.mdx
@@ -8,12 +8,11 @@ You can check the available commands:
usage: datasets-cli <command> [<args>]

positional arguments:
{convert,env,test,run_beam,dummy_data,convert_to_parquet}
{convert,env,test,dummy_data,convert_to_parquet}
datasets-cli command helpers
convert Convert a TensorFlow Datasets dataset to a HuggingFace Datasets dataset.
env Print relevant system environment info.
test Test dataset implementation.
run_beam Run a Beam dataset processing pipeline
dummy_data Generate dummy data.
convert_to_parquet Convert dataset to Parquet
delete_from_hub Delete dataset config from the Hub
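If you want to confirm locally that `run_beam` no longer appears in the command list, one quick check (a sketch; it only assumes `datasets-cli` is on your PATH) is:

```python
import subprocess

# Print the CLI help; after this change the positional-arguments list
# should read {convert,env,test,dummy_data,convert_to_parquet} with no run_beam.
result = subprocess.run(["datasets-cli", "--help"], capture_output=True, text=True, check=True)
print(result.stdout)
```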
2 changes: 0 additions & 2 deletions docs/source/package_reference/builder_classes.mdx
@@ -8,8 +8,6 @@

[[autodoc]] datasets.GeneratorBasedBuilder

[[autodoc]] datasets.BeamBasedBuilder

[[autodoc]] datasets.ArrowBasedBuilder

[[autodoc]] datasets.BuilderConfig
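After this removal, the documented builder base classes are `GeneratorBasedBuilder` and `ArrowBasedBuilder`. For context, a minimal `GeneratorBasedBuilder` sketch (the class name, URL, and feature names below are hypothetical, not part of this commit):

```python
import datasets


class MyTextDataset(datasets.GeneratorBasedBuilder):
    """Hypothetical example of the remaining, non-Beam builder API."""

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features({"text": datasets.Value("string")}),
        )

    def _split_generators(self, dl_manager):
        # Standard download path; no Beam pipeline involved.
        path = dl_manager.download_and_extract("https://example.com/train.txt")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": path}
            )
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"text": line.rstrip("\n")}
```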
2 changes: 0 additions & 2 deletions setup.py
@@ -180,7 +180,6 @@
"torch>=2.0.0",
"soundfile>=0.12.1",
"transformers",
"typing-extensions>=4.6.1", # due to conflict between apache-beam and pydantic
"zstandard",
"polars[timezone]>=0.20.0",
]
@@ -230,7 +229,6 @@
EXTRAS_REQUIRE = {
"audio": AUDIO_REQUIRE,
"vision": VISION_REQUIRE,
"apache-beam": ["apache-beam>=2.26.0"],
"tensorflow": [
"tensorflow>=2.6.0",
],
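A quick way to check which extras an installed `datasets` still declares (a sketch; it assumes the installed version already contains this change, otherwise `apache-beam` will still be listed):

```python
from importlib.metadata import metadata

# List the extras declared by the installed `datasets` distribution;
# `apache-beam` should be absent after this change.
extras = metadata("datasets").get_all("Provides-Extra") or []
print(sorted(extras))
print("apache-beam" in extras)  # expected: False
```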
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -16,7 +16,7 @@

from .arrow_dataset import Dataset
from .arrow_reader import ReadInstruction
from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .combine import concatenate_datasets, interleave_datasets
from .dataset_dict import DatasetDict, IterableDatasetDict
from .download import *
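The practical effect of the `__init__.py` change is that `BeamBasedBuilder` is no longer importable from the top-level package; a small check (a sketch run against an install that includes this commit):

```python
# These remain part of the public API.
from datasets import ArrowBasedBuilder, DatasetBuilder, GeneratorBasedBuilder  # noqa: F401

try:
    from datasets import BeamBasedBuilder  # removed by this commit
except ImportError:
    print("BeamBasedBuilder is no longer exported")
```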
132 changes: 0 additions & 132 deletions src/datasets/arrow_writer.py
@@ -13,11 +13,8 @@
# Lint as: python3
"""To write records into Parquet files."""

import errno
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import fsspec
@@ -43,8 +40,6 @@
from .keyhash import DuplicatedKeysError, KeyHasher
from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
from .utils import logging
from .utils import tqdm as hf_tqdm
from .utils.file_utils import hash_url_to_filename
from .utils.py_utils import asdict, first_non_null_value


@@ -617,130 +612,3 @@ def finalize(self, close_stream=True):

class ParquetWriter(ArrowWriter):
_WRITER_CLASS = pq.ParquetWriter


class BeamWriter:
"""
Shuffles and writes Examples to Arrow files.
The Arrow files are converted from Parquet files that are the output of Apache Beam pipelines.
"""

def __init__(
self,
features: Optional[Features] = None,
schema: Optional[pa.Schema] = None,
path: Optional[str] = None,
namespace: Optional[str] = None,
cache_dir: Optional[str] = None,
):
if features is None and schema is None:
raise ValueError("At least one of features and schema must be provided.")
if path is None:
raise ValueError("Path must be provided.")

if features is not None:
self._features: Features = features
self._schema: pa.Schema = features.arrow_schema
else:
self._schema: pa.Schema = schema
self._features: Features = Features.from_arrow_schema(schema)

self._path = path
self._parquet_path = os.path.splitext(path)[0] # remove extension
self._namespace = namespace or "default"
self._num_examples = None
self._cache_dir = cache_dir or config.HF_DATASETS_CACHE

def write_from_pcollection(self, pcoll_examples):
"""Add the final steps of the beam pipeline: write to parquet files."""
import apache_beam as beam

def inc_num_examples(example):
beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()

# count examples
_ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)

# save dataset
return (
pcoll_examples
| "Get values" >> beam.Values()
| "Save to parquet"
>> beam.io.parquetio.WriteToParquet(
self._parquet_path, self._schema, shard_name_template="-SSSSS-of-NNNNN.parquet"
)
)

def finalize(self, metrics_query_result: dict):
"""
Run after the pipeline has finished.
It converts the resulting parquet files to arrow and it completes the info from the pipeline metrics.
Args:
metrics_query_result: `dict` obtained from pipeline_results.metrics().query(m_filter). Make sure
that the filter keeps only the metrics for the considered split, under the namespace `split_name`.
"""

# Beam FileSystems require the system's path separator in the older versions
fs, parquet_path = url_to_fs(self._parquet_path)
parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)

shards = fs.glob(parquet_path + "*.parquet")
num_bytes = sum(fs.sizes(shards))
shard_lengths = get_parquet_lengths(shards)

# Convert to arrow
if self._path.endswith(".arrow"):
logger.info(f"Converting parquet files {self._parquet_path} to arrow {self._path}")
try: # stream conversion
num_bytes = 0
for shard in hf_tqdm(shards, unit="shards"):
with fs.open(shard, "rb") as source:
with fs.open(shard.replace(".parquet", ".arrow"), "wb") as destination:
shard_num_bytes, _ = parquet_to_arrow(source, destination)
num_bytes += shard_num_bytes
except OSError as e: # broken pipe can happen if the connection is unstable, do local conversion instead
if e.errno != errno.EPIPE: # not a broken pipe
raise
logger.warning(
"Broken Pipe during stream conversion from parquet to arrow. Using local convert instead"
)
local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
os.makedirs(local_convert_dir, exist_ok=True)
num_bytes = 0
for shard in hf_tqdm(shards, unit="shards"):
local_parquet_path = os.path.join(local_convert_dir, hash_url_to_filename(shard) + ".parquet")
fs.download(shard, local_parquet_path)
local_arrow_path = local_parquet_path.replace(".parquet", ".arrow")
shard_num_bytes, _ = parquet_to_arrow(local_parquet_path, local_arrow_path)
num_bytes += shard_num_bytes
remote_arrow_path = shard.replace(".parquet", ".arrow")
fs.upload(local_arrow_path, remote_arrow_path)

# Save metrics
counters_dict = {metric.key.metric.name: metric.result for metric in metrics_query_result["counters"]}
self._num_examples = counters_dict["num_examples"]
self._num_bytes = num_bytes
self._shard_lengths = shard_lengths
return self._num_examples, self._num_bytes


def get_parquet_lengths(sources) -> List[int]:
shard_lengths = []
for source in hf_tqdm(sources, unit="parquet files"):
parquet_file = pa.parquet.ParquetFile(source)
shard_lengths.append(parquet_file.metadata.num_rows)
return shard_lengths


def parquet_to_arrow(source, destination) -> List[int]:
"""Convert parquet file to arrow file. Inputs can be str paths or file-like objects"""
stream = None if isinstance(destination, str) else destination
parquet_file = pa.parquet.ParquetFile(source)
# Beam can create empty Parquet files, so we need to pass the source Parquet file's schema
with ArrowWriter(schema=parquet_file.schema_arrow, path=destination, stream=stream) as writer:
for record_batch in parquet_file.iter_batches():
pa_table = pa.Table.from_batches([record_batch])
writer.write_table(pa_table)
num_bytes, num_examples = writer.finalize()
return num_bytes, num_examples
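Code that still needs the Parquet-to-Arrow conversion provided by the deleted `parquet_to_arrow` helper can fall back to plain pyarrow; a minimal sketch, under the assumption that the target is an Arrow IPC stream file like the library's `.arrow` shards (file names are placeholders):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Stream a Parquet shard into an Arrow IPC stream file batch by batch,
# roughly what the removed parquet_to_arrow helper did through ArrowWriter.
parquet_file = pq.ParquetFile("shard.parquet")  # placeholder input path
with pa.OSFile("shard.arrow", "wb") as sink:  # placeholder output path
    with pa.ipc.new_stream(sink, parquet_file.schema_arrow) as writer:
        for batch in parquet_file.iter_batches():
            writer.write_batch(batch)
```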
