Description
The tests/test_ocr_xfund_google.py test is failing, and likely other tests too.
To reproduce the error: poetry run pytest -v tests/test_ocr_xfund_google.py
========================================================================= test session starts =========================================================================
platform darwin -- Python 3.10.16, pytest-7.4.4, pluggy-1.6.0 -- Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/bin/python
cachedir: .pytest_cache
rootdir: docling-eval
plugins: dependency-0.6.0, xdist-3.6.1
collected 1 item
tests/test_ocr_xfund_google.py::test_run_xfund_builder FAILED [100%]
============================================================================== FAILURES ===============================================================================
_______________________________________________________________________ test_run_xfund_builder ________________________________________________________________________
self = <datasets.packaged_modules.parquet.parquet.Parquet object at 0x14cd115a0>, gen_kwargs = {'files': []}
fpath = ' .cache/huggingface/datasets/parquet/default-6235769bde6b2eb0/0.0.0/9c460aabd2aa27d1496e5e38d2060760561f0ac2cd6a110134eefa5b3f153b8d.incomplete/parquet-test-JJJJJ-SSSSS-of-NNNNN.arrow'
file_format = 'arrow', max_shard_size = 500000000, job_id = 0
def _prepare_split_single(
self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
generator = self._generate_tables(**gen_kwargs)
writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
embed_local_files = file_format == "parquet"
shard_lengths = []
total_num_examples, total_num_bytes = 0, 0
shard_id = 0
num_examples_progress_update = 0
try:
writer = writer_class(
features=self.info.features,
path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
writer_batch_size=self._writer_batch_size,
storage_options=self._fs.storage_options,
embed_local_files=embed_local_files,
)
try:
_time = time.time()
for _, table in generator:
if max_shard_size is not None and writer._num_bytes > max_shard_size:
num_examples, num_bytes = writer.finalize()
writer.close()
shard_lengths.append(num_examples)
total_num_examples += num_examples
total_num_bytes += num_bytes
shard_id += 1
writer = writer_class(
features=writer._features,
path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
writer_batch_size=self._writer_batch_size,
storage_options=self._fs.storage_options,
embed_local_files=embed_local_files,
)
try:
writer.write_table(table)
except CastError as cast_error:
raise DatasetGenerationCastError.from_cast_error(
cast_error=cast_error,
builder_name=self.info.builder_name,
gen_kwargs=gen_kwargs,
token=self.token,
)
num_examples_progress_update += len(table)
if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
_time = time.time()
yield job_id, False, num_examples_progress_update
num_examples_progress_update = 0
finally:
yield job_id, False, num_examples_progress_update
num_shards = shard_id + 1
> num_examples, num_bytes = writer.finalize()
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1887:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <datasets.arrow_writer.ArrowWriter object at 0x1500141c0>, close_stream = True
def finalize(self, close_stream=True):
self.write_rows_on_file()
# In case current_examples < writer_batch_size, but user uses finalize()
if self._check_duplicates:
self.check_duplicate_keys()
# Re-initializing to empty list for next batch
self.hkey_record = []
self.write_examples_on_file()
# If schema is known, infer features even if no examples were written
if self.pa_writer is None and self.schema:
self._build_writer(self.schema)
if self.pa_writer is not None:
self.pa_writer.close()
self.pa_writer = None
if close_stream:
self.stream.close()
else:
if close_stream:
self.stream.close()
> raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
E datasets.arrow_writer.SchemaInferenceError: Please pass `features` or at least one example when writing data
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/arrow_writer.py:669: SchemaInferenceError
The above exception was the direct cause of the following exception:
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_run_xfund_builder():
target_path = Path(f"./scratch/{BenchMarkNames.XFUND.value}_google/")
google_provider = GoogleDocAIPredictionProvider(
do_visualization=True, ignore_missing_predictions=False
)
dataset = XFUNDDatasetBuilder(
dataset_source=target_path / "input_dataset",
target=target_path / "gt_dataset",
end_index=2,
)
dataset.retrieve_input_dataset()
dataset.save_to_disk() # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet.
> google_provider.create_prediction_dataset(
name=dataset.name,
gt_dataset_dir=target_path / "gt_dataset",
target_dataset_dir=target_path / "eval_dataset",
)
tests/test_ocr_xfund_google.py:40:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
docling_eval/prediction_providers/base_prediction_provider.py:332: in create_prediction_dataset
ds = load_dataset("parquet", data_files={split: parquet_files})
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/load.py:2084: in load_dataset
builder_instance.download_and_prepare(
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:925: in download_and_prepare
self._download_and_prepare(
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1001: in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1742: in _prepare_split
for job_id, done, content in self._prepare_split_single(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <datasets.packaged_modules.parquet.parquet.Parquet object at 0x14cd115a0>, gen_kwargs = {'files': []}
fpath = cache/huggingface/datasets/parquet/default-6235769bde6b2eb0/0.0.0/9c460aabd2aa27d1496e5e38d2060760561f0ac2cd6a110134eefa5b3f153b8d.incomplete/parquet-test-JJJJJ-SSSSS-of-NNNNN.arrow'
file_format = 'arrow', max_shard_size = 500000000, job_id = 0
def _prepare_split_single(
self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
generator = self._generate_tables(**gen_kwargs)
writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
embed_local_files = file_format == "parquet"
shard_lengths = []
total_num_examples, total_num_bytes = 0, 0
shard_id = 0
num_examples_progress_update = 0
try:
writer = writer_class(
features=self.info.features,
path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
writer_batch_size=self._writer_batch_size,
storage_options=self._fs.storage_options,
embed_local_files=embed_local_files,
)
try:
_time = time.time()
for _, table in generator:
if max_shard_size is not None and writer._num_bytes > max_shard_size:
num_examples, num_bytes = writer.finalize()
writer.close()
shard_lengths.append(num_examples)
total_num_examples += num_examples
total_num_bytes += num_bytes
shard_id += 1
writer = writer_class(
features=writer._features,
path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
writer_batch_size=self._writer_batch_size,
storage_options=self._fs.storage_options,
embed_local_files=embed_local_files,
)
try:
writer.write_table(table)
except CastError as cast_error:
raise DatasetGenerationCastError.from_cast_error(
cast_error=cast_error,
builder_name=self.info.builder_name,
gen_kwargs=gen_kwargs,
token=self.token,
)
num_examples_progress_update += len(table)
if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
_time = time.time()
yield job_id, False, num_examples_progress_update
num_examples_progress_update = 0
finally:
yield job_id, False, num_examples_progress_update
num_shards = shard_id + 1
num_examples, num_bytes = writer.finalize()
writer.close()
shard_lengths.append(num_examples)
total_num_examples += num_examples
total_num_bytes += num_bytes
except Exception as e:
# Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
e = e.__context__
if isinstance(e, DatasetGenerationError):
raise
> raise DatasetGenerationError("An error occurred while generating the dataset") from e
E datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/datasets/builder.py:1898: DatasetGenerationError
------------------------------------------------------------------------ Captured stderr call -------------------------------------------------------------------------
100%|██████████| 2/2 [00:02<00:00, 1.48s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.59ba/s]
Generating test split: 0 examples [00:00, ? examples/s]
========================================================================== warnings summary ===========================================================================
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/docling_core/types/doc/document.py:4112
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
tests/test_ocr_xfund_google.py::test_run_xfund_builder
ibrary/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/docling_core/types/doc/document.py:4112: DeprecationWarning: deprecated
if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/PyPDF2/__init__.py:21
ibrary/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.
warnings.warn(
../../../Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/pydantic/_internal/_config.py:323
Library/Caches/pypoetry/virtualenvs/docling-eval-ruLMhOMw-py3.10/lib/python3.10/site-packages/pydantic/_internal/_config.py:323: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================================================================= short test summary info =======================================================================
FAILED tests/test_ocr_xfund_google.py::test_run_xfund_builder - datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
============================================================== 1 failed, 7 warnings in 86.08s (0:01:26) ===============================================================
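Note that the traceback shows create_prediction_dataset() calling load_dataset("parquet", data_files={split: parquet_files}) while the parquet builder receives gen_kwargs = {'files': []}, i.e. an empty shard list, right before the SchemaInferenceError. Below is a hypothetical diagnostic sketch (not part of docling-eval; the scratch path is assumed from the test code above) that simply lists the parquet shards the test is expected to have written, to confirm whether load_dataset is indeed being handed an empty file list:

from pathlib import Path

# Assumed to match f"./scratch/{BenchMarkNames.XFUND.value}_google/" from the test above.
target_path = Path("./scratch/xfund_google/")

for subdir in ("gt_dataset", "eval_dataset"):
    # List every parquet shard under the ground-truth and prediction directories.
    shards = sorted((target_path / subdir).rglob("*.parquet"))
    print(f"{subdir}: {len(shards)} parquet shard(s)")
    for shard in shards:
        print("   ", shard)

If the directory that create_prediction_dataset reads from contains no shards for the split being loaded, that would be consistent with the empty 'files' list seen in the traceback.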