
[Bee] An error occurred while generating the dataset #104

@samiuc

Description

The tests/test_ocr_xfund_google.py test is failing, and likely other tests as well.

To reproduce the error, run: poetry run pytest -v tests/test_ocr_xfund_google.py

poetry run pytest -v tests/test_ocr_xfund_google.py
========================================================================= test session starts =========================================================================
platform darwin -- Python 3.10.16, pytest-7.4.4, pluggy-1.6.0 -- Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/bin/python
cachedir: .pytest_cache
rootdir: docling-eval
plugins: dependency-0.6.0, xdist-3.6.1
collected 1 item                                                                                                                                                      

tests/test_ocr_xfund_google.py::test_run_xfund_builder FAILED                                                                                                   [100%]

============================================================================== FAILURES ===============================================================================
_______________________________________________________________________ test_run_xfund_builder ________________________________________________________________________

self = <datasets.packaged_modules.parquet.parquet.Parquet object at 0x14cd115a0>, gen_kwargs = {'files': []}
fpath = ' .cache/huggingface/datasets/parquet/default-6235769bde6b2eb0/0.0.0/9c460aabd2aa27d1496e5e38d2060760561f0ac2cd6a110134eefa5b3f153b8d.incomplete/parquet-test-JJJJJ-SSSSS-of-NNNNN.arrow'
file_format = 'arrow', max_shard_size = 500000000, job_id = 0

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
        generator = self._generate_tables(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0
    
        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for _, table in generator:
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    try:
                        writer.write_table(table)
                    except CastError as cast_error:
                        raise DatasetGenerationCastError.from_cast_error(
                            cast_error=cast_error,
                            builder_name=self.info.builder_name,
                            gen_kwargs=gen_kwargs,
                            token=self.token,
                        )
                    num_examples_progress_update += len(table)
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
>               num_examples, num_bytes = writer.finalize()

../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1887: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <datasets.arrow_writer.ArrowWriter object at 0x1500141c0>, close_stream = True

    def finalize(self, close_stream=True):
        self.write_rows_on_file()
        # In case current_examples < writer_batch_size, but user uses finalize()
        if self._check_duplicates:
            self.check_duplicate_keys()
            # Re-initializing to empty list for next batch
            self.hkey_record = []
        self.write_examples_on_file()
        # If schema is known, infer features even if no examples were written
        if self.pa_writer is None and self.schema:
            self._build_writer(self.schema)
        if self.pa_writer is not None:
            self.pa_writer.close()
            self.pa_writer = None
            if close_stream:
                self.stream.close()
        else:
            if close_stream:
                self.stream.close()
>           raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
E           datasets.arrow_writer.SchemaInferenceError: Please pass `features` or at least one example when writing data

../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/arrow_writer.py:669: SchemaInferenceError

The above exception was the direct cause of the following exception:

    @pytest.mark.skipif(
        IS_CI, reason="Skipping test in CI because the dataset is too heavy."
    )
    def test_run_xfund_builder():
        target_path = Path(f"./scratch/{BenchMarkNames.XFUND.value}_google/")
        google_provider = GoogleDocAIPredictionProvider(
            do_visualization=True, ignore_missing_predictions=False
        )
    
        dataset = XFUNDDatasetBuilder(
            dataset_source=target_path / "input_dataset",
            target=target_path / "gt_dataset",
            end_index=2,
        )
    
        dataset.retrieve_input_dataset()
        dataset.save_to_disk()  # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet.
    
>       google_provider.create_prediction_dataset(
            name=dataset.name,
            gt_dataset_dir=target_path / "gt_dataset",
            target_dataset_dir=target_path / "eval_dataset",
        )

tests/test_ocr_xfund_google.py:40: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
docling_eval/prediction_providers/base_prediction_provider.py:332: in create_prediction_dataset
    ds = load_dataset("parquet", data_files={split: parquet_files})
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/load.py:2084: in load_dataset
    builder_instance.download_and_prepare(
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:925: in download_and_prepare
    self._download_and_prepare(
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1001: in _download_and_prepare
    self._prepare_split(split_generator, **prepare_split_kwargs)
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1742: in _prepare_split
    for job_id, done, content in self._prepare_split_single(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <datasets.packaged_modules.parquet.parquet.Parquet object at 0x14cd115a0>, gen_kwargs = {'files': []}
fpath = ' .cache/huggingface/datasets/parquet/default-6235769bde6b2eb0/0.0.0/9c460aabd2aa27d1496e5e38d2060760561f0ac2cd6a110134eefa5b3f153b8d.incomplete/parquet-test-JJJJJ-SSSSS-of-NNNNN.arrow'
file_format = 'arrow', max_shard_size = 500000000, job_id = 0

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
        generator = self._generate_tables(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0
    
        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for _, table in generator:
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    try:
                        writer.write_table(table)
                    except CastError as cast_error:
                        raise DatasetGenerationCastError.from_cast_error(
                            cast_error=cast_error,
                            builder_name=self.info.builder_name,
                            gen_kwargs=gen_kwargs,
                            token=self.token,
                        )
                    num_examples_progress_update += len(table)
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            if isinstance(e, DatasetGenerationError):
                raise
>           raise DatasetGenerationError("An error occurred while generating the dataset") from e
E           datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset

../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1898: DatasetGenerationError
------------------------------------------------------------------------ Captured stderr call -------------------------------------------------------------------------
100%|██████████| 2/2 [00:02<00:00,  1.48s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.59ba/s]
Generating test split: 0 examples [00:00, ? examples/s]
========================================================================== warnings summary ===========================================================================
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/docling_core/types/doc/document.py:4112
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
   Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/docling_core/types/doc/document.py:4112: DeprecationWarning: deprecated
    if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):

../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/PyPDF2/__init__.py:21
  Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.
    warnings.warn(

../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/pydantic/_internal/_config.py:323
   Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/pydantic/_internal/_config.py:323: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
    warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================================================================= short test summary info =======================================================================
FAILED tests/test_ocr_xfund_google.py::test_run_xfund_builder - datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
============================================================== 1 failed, 7 warnings in 86.08s (0:01:26) ===============================================================
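
A note on the likely failure mode, based only on the captured output above (not on the repository code): create_prediction_dataset calls load_dataset("parquet", data_files={split: parquet_files}) at base_prediction_provider.py:332, and the parquet builder then receives gen_kwargs = {'files': []} and reports "Generating test split: 0 examples". In other words, the parquet_files list for the split appears to be empty, so datasets has no schema and no examples to write, raises SchemaInferenceError, and wraps it in DatasetGenerationError. Since the stderr capture shows one parquet shard being created earlier, the shards may simply be written somewhere other than where the prediction provider looks for them. Below is a minimal sketch of a defensive check; the directory layout and helper name are assumptions for illustration, not the project's actual API.

    from pathlib import Path

    from datasets import load_dataset


    def load_split_or_fail(gt_dataset_dir: Path, split: str = "test"):
        """Hypothetical guard: fail loudly when no parquet shards exist for a split."""
        # Assumed layout: <gt_dataset_dir>/<split>/*.parquet, written by save_to_disk().
        parquet_files = sorted(str(p) for p in (gt_dataset_dir / split).glob("*.parquet"))

        # An empty list reaches the parquet builder as gen_kwargs={'files': []},
        # which is exactly the state shown in the traceback above.
        if not parquet_files:
            raise FileNotFoundError(
                f"No parquet shards found in {gt_dataset_dir / split}; "
                f"was the ground-truth dataset written for the '{split}' split?"
            )

        return load_dataset("parquet", data_files={split: parquet_files})

With a guard like this, the test would fail with a message pointing at the missing shards instead of the generic DatasetGenerationError shown above.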
