Remove beam (#6987)
* Delete beam tests

* Delete BeamBasedBuilder

* Delete BeamWriter

* Delete RunBeamCommand

* Delete DownloadManager.ship_files_with_pipeline

* Delete beam_utils

* Delete require_beam

* Delete config beam variables

* Delete apache-beam extras require

* Update setup.py: remove the typing-extensions pin that only existed because of the apache-beam/pydantic conflict

* Delete Beam from docs

* Delete Beam from comments and docstrings

* Delete tests of HF GCP
albertvillanova authored Jun 26, 2024
1 parent 637246b commit b275462
Showing 19 changed files with 9 additions and 1,069 deletions.
2 changes: 0 additions & 2 deletions docs/source/_toctree.yml
@@ -50,8 +50,6 @@
title: CLI
- local: how_to_metrics
title: Metrics
- local: beam
title: Beam Datasets
- local: troubleshoot
title: Troubleshooting
title: "General usage"
52 changes: 0 additions & 52 deletions docs/source/beam.mdx

This file was deleted.

3 changes: 1 addition & 2 deletions docs/source/cli.mdx
@@ -8,12 +8,11 @@ You can check the available commands:
usage: datasets-cli <command> [<args>]

positional arguments:
{convert,env,test,run_beam,dummy_data,convert_to_parquet}
{convert,env,test,dummy_data,convert_to_parquet}
datasets-cli command helpers
convert Convert a TensorFlow Datasets dataset to a HuggingFace Datasets dataset.
env Print relevant system environment info.
test Test dataset implementation.
run_beam Run a Beam dataset processing pipeline
dummy_data Generate dummy data.
convert_to_parquet Convert dataset to Parquet
delete_from_hub Delete dataset config from the Hub
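If you want to confirm locally that `run_beam` no longer appears in the command list, one quick check (a sketch; it only assumes `datasets-cli` is on your PATH) is:

```python
import subprocess

# Print the CLI help; after this change the positional-arguments list
# should read {convert,env,test,dummy_data,convert_to_parquet} with no run_beam.
result = subprocess.run(["datasets-cli", "--help"], capture_output=True, text=True, check=True)
print(result.stdout)
```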
2 changes: 0 additions & 2 deletions docs/source/package_reference/builder_classes.mdx
@@ -8,8 +8,6 @@

[[autodoc]] datasets.GeneratorBasedBuilder

[[autodoc]] datasets.BeamBasedBuilder

[[autodoc]] datasets.ArrowBasedBuilder

[[autodoc]] datasets.BuilderConfig
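After this removal, the documented builder base classes are `GeneratorBasedBuilder` and `ArrowBasedBuilder`. For context, a minimal `GeneratorBasedBuilder` sketch (the class name, URL, and feature names below are hypothetical, not part of this commit):

```python
import datasets


class MyTextDataset(datasets.GeneratorBasedBuilder):
    """Hypothetical example of the remaining, non-Beam builder API."""

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features({"text": datasets.Value("string")}),
        )

    def _split_generators(self, dl_manager):
        # Standard download path; no Beam pipeline involved.
        path = dl_manager.download_and_extract("https://example.com/train.txt")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": path}
            )
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"text": line.rstrip("\n")}
```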
2 changes: 0 additions & 2 deletions setup.py
@@ -180,7 +180,6 @@
"torch>=2.0.0",
"soundfile>=0.12.1",
"transformers",
"typing-extensions>=4.6.1", # due to conflict between apache-beam and pydantic
"zstandard",
"polars[timezone]>=0.20.0",
]
@@ -230,7 +229,6 @@
EXTRAS_REQUIRE = {
"audio": AUDIO_REQUIRE,
"vision": VISION_REQUIRE,
"apache-beam": ["apache-beam>=2.26.0"],
"tensorflow": [
"tensorflow>=2.6.0",
],
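A quick way to check which extras an installed `datasets` still declares (a sketch; it assumes the installed version already contains this change, otherwise `apache-beam` will still be listed):

```python
from importlib.metadata import metadata

# List the extras declared by the installed `datasets` distribution;
# `apache-beam` should be absent after this change.
extras = metadata("datasets").get_all("Provides-Extra") or []
print(sorted(extras))
print("apache-beam" in extras)  # expected: False
```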
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -16,7 +16,7 @@

from .arrow_dataset import Dataset
from .arrow_reader import ReadInstruction
from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
from .combine import concatenate_datasets, interleave_datasets
from .dataset_dict import DatasetDict, IterableDatasetDict
from .download import *
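The practical effect of the `__init__.py` change is that `BeamBasedBuilder` is no longer importable from the top-level package; a small check (a sketch run against an install that includes this commit):

```python
# These remain part of the public API.
from datasets import ArrowBasedBuilder, DatasetBuilder, GeneratorBasedBuilder  # noqa: F401

try:
    from datasets import BeamBasedBuilder  # removed by this commit
except ImportError:
    print("BeamBasedBuilder is no longer exported")
```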
132 changes: 0 additions & 132 deletions src/datasets/arrow_writer.py
@@ -13,11 +13,8 @@
# Lint as: python3
"""To write records into Parquet files."""

import errno
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import fsspec
@@ -43,8 +40,6 @@
from .keyhash import DuplicatedKeysError, KeyHasher
from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
from .utils import logging
from .utils import tqdm as hf_tqdm
from .utils.file_utils import hash_url_to_filename
from .utils.py_utils import asdict, first_non_null_value


@@ -617,130 +612,3 @@ def finalize(self, close_stream=True):

class ParquetWriter(ArrowWriter):
_WRITER_CLASS = pq.ParquetWriter


class BeamWriter:
"""
Shuffles and writes Examples to Arrow files.
The Arrow files are converted from Parquet files that are the output of Apache Beam pipelines.
"""

def __init__(
self,
features: Optional[Features] = None,
schema: Optional[pa.Schema] = None,
path: Optional[str] = None,
namespace: Optional[str] = None,
cache_dir: Optional[str] = None,
):
if features is None and schema is None:
raise ValueError("At least one of features and schema must be provided.")
if path is None:
raise ValueError("Path must be provided.")

if features is not None:
self._features: Features = features
self._schema: pa.Schema = features.arrow_schema
else:
self._schema: pa.Schema = schema
self._features: Features = Features.from_arrow_schema(schema)

self._path = path
self._parquet_path = os.path.splitext(path)[0] # remove extension
self._namespace = namespace or "default"
self._num_examples = None
self._cache_dir = cache_dir or config.HF_DATASETS_CACHE

def write_from_pcollection(self, pcoll_examples):
"""Add the final steps of the beam pipeline: write to parquet files."""
import apache_beam as beam

def inc_num_examples(example):
beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()

# count examples
_ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)

# save dataset
return (
pcoll_examples
| "Get values" >> beam.Values()
| "Save to parquet"
>> beam.io.parquetio.WriteToParquet(
self._parquet_path, self._schema, shard_name_template="-SSSSS-of-NNNNN.parquet"
)
)

def finalize(self, metrics_query_result: dict):
"""
Run after the pipeline has finished.
It converts the resulting parquet files to arrow and it completes the info from the pipeline metrics.
Args:
metrics_query_result: `dict` obtained from pipeline_results.metrics().query(m_filter). Make sure
that the filter keeps only the metrics for the considered split, under the namespace `split_name`.
"""

# Beam FileSystems require the system's path separator in the older versions
fs, parquet_path = url_to_fs(self._parquet_path)
parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)

shards = fs.glob(parquet_path + "*.parquet")
num_bytes = sum(fs.sizes(shards))
shard_lengths = get_parquet_lengths(shards)

# Convert to arrow
if self._path.endswith(".arrow"):
logger.info(f"Converting parquet files {self._parquet_path} to arrow {self._path}")
try: # stream conversion
num_bytes = 0
for shard in hf_tqdm(shards, unit="shards"):
with fs.open(shard, "rb") as source:
with fs.open(shard.replace(".parquet", ".arrow"), "wb") as destination:
shard_num_bytes, _ = parquet_to_arrow(source, destination)
num_bytes += shard_num_bytes
except OSError as e: # broken pipe can happen if the connection is unstable, do local conversion instead
if e.errno != errno.EPIPE: # not a broken pipe
raise
logger.warning(
"Broken Pipe during stream conversion from parquet to arrow. Using local convert instead"
)
local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
os.makedirs(local_convert_dir, exist_ok=True)
num_bytes = 0
for shard in hf_tqdm(shards, unit="shards"):
local_parquet_path = os.path.join(local_convert_dir, hash_url_to_filename(shard) + ".parquet")
fs.download(shard, local_parquet_path)
local_arrow_path = local_parquet_path.replace(".parquet", ".arrow")
shard_num_bytes, _ = parquet_to_arrow(local_parquet_path, local_arrow_path)
num_bytes += shard_num_bytes
remote_arrow_path = shard.replace(".parquet", ".arrow")
fs.upload(local_arrow_path, remote_arrow_path)

# Save metrics
counters_dict = {metric.key.metric.name: metric.result for metric in metrics_query_result["counters"]}
self._num_examples = counters_dict["num_examples"]
self._num_bytes = num_bytes
self._shard_lengths = shard_lengths
return self._num_examples, self._num_bytes


def get_parquet_lengths(sources) -> List[int]:
shard_lengths = []
for source in hf_tqdm(sources, unit="parquet files"):
parquet_file = pa.parquet.ParquetFile(source)
shard_lengths.append(parquet_file.metadata.num_rows)
return shard_lengths


def parquet_to_arrow(source, destination) -> List[int]:
"""Convert parquet file to arrow file. Inputs can be str paths or file-like objects"""
stream = None if isinstance(destination, str) else destination
parquet_file = pa.parquet.ParquetFile(source)
# Beam can create empty Parquet files, so we need to pass the source Parquet file's schema
with ArrowWriter(schema=parquet_file.schema_arrow, path=destination, stream=stream) as writer:
for record_batch in parquet_file.iter_batches():
pa_table = pa.Table.from_batches([record_batch])
writer.write_table(pa_table)
num_bytes, num_examples = writer.finalize()
return num_bytes, num_examples
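Code that still needs the Parquet-to-Arrow conversion provided by the deleted `parquet_to_arrow` helper can fall back to plain pyarrow; a minimal sketch, under the assumption that the target is an Arrow IPC stream file like the library's `.arrow` shards (file names are placeholders):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Stream a Parquet shard into an Arrow IPC stream file batch by batch,
# roughly what the removed parquet_to_arrow helper did through ArrowWriter.
parquet_file = pq.ParquetFile("shard.parquet")  # placeholder input path
with pa.OSFile("shard.arrow", "wb") as sink:  # placeholder output path
    with pa.ipc.new_stream(sink, parquet_file.schema_arrow) as writer:
        for batch in parquet_file.iter_batches():
            writer.write_batch(batch)
```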
