There are multiple 'mteb/arguana' configurations in the cache: default, corpus, queries with HF_HUB_OFFLINE=1 #7359
Open
Description
Describe the bug
Hey folks,
I am trying to run this code -
from datasets import load_dataset, get_dataset_config_names
ds = load_dataset("mteb/arguana")
with HF_HUB_OFFLINE=1
But I get the following error -
Using the latest cached version of the dataset since mteb/arguana couldn't be found on the Hugging Face Hub (offline mode is enabled).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[2], line 1
----> 1 ds = load_dataset("mteb/arguana")
File ~/env/lib/python3.10/site-packages/datasets/load.py:2129, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2124 verification_mode = VerificationMode(
2125 (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
2126 )
2128 # Create a dataset builder
-> 2129 builder_instance = load_dataset_builder(
2130 path=path,
2131 name=name,
2132 data_dir=data_dir,
2133 data_files=data_files,
2134 cache_dir=cache_dir,
2135 features=features,
2136 download_config=download_config,
2137 download_mode=download_mode,
2138 revision=revision,
2139 token=token,
2140 storage_options=storage_options,
2141 trust_remote_code=trust_remote_code,
2142 _require_default_config_name=name is None,
2143 **config_kwargs,
2144 )
2146 # Return iterable dataset in case of streaming
2147 if streaming:
File ~/env/lib/python3.10/site-packages/datasets/load.py:1886, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)
1884 builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
1885 # Instantiate the dataset builder
-> 1886 builder_instance: DatasetBuilder = builder_cls(
1887 cache_dir=cache_dir,
1888 dataset_name=dataset_name,
1889 config_name=config_name,
1890 data_dir=data_dir,
1891 data_files=data_files,
1892 hash=dataset_module.hash,
1893 info=info,
1894 features=features,
1895 token=token,
1896 storage_options=storage_options,
1897 **builder_kwargs,
1898 **config_kwargs,
1899 )
1900 builder_instance._use_legacy_cache_dir_if_possible(dataset_module)
1902 return builder_instance
File ~/env/lib/python3.10/site-packages/datasets/packaged_modules/cache/cache.py:124, in Cache.__init__(self, cache_dir, dataset_name, config_name, version, hash, base_path, info, features, token, repo_id, data_files, data_dir, storage_options, writer_batch_size, **config_kwargs)
122 config_kwargs["data_dir"] = data_dir
123 if hash == "auto" and version == "auto":
--> 124 config_name, version, hash = _find_hash_in_cache(
125 dataset_name=repo_id or dataset_name,
126 config_name=config_name,
127 cache_dir=cache_dir,
128 config_kwargs=config_kwargs,
129 custom_features=features,
130 )
131 elif hash == "auto" or version == "auto":
132 raise NotImplementedError("Pass both hash='auto' and version='auto' instead")
File ~/env/lib/python3.10/site-packages/datasets/packaged_modules/cache/cache.py:84, in _find_hash_in_cache(dataset_name, config_name, cache_dir, config_kwargs, custom_features)
72 other_configs = [
73 Path(_cached_directory_path).parts[-3]
74 for _cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", version, hash))
(...)
81 )
82 ]
83 if not config_id and len(other_configs) > 1:
---> 84 raise ValueError(
85 f"There are multiple '{dataset_name}' configurations in the cache: {', '.join(other_configs)}"
86 f"\nPlease specify which configuration to reload from the cache, e.g."
87 f"\n\tload_dataset('{dataset_name}', '{other_configs[0]}')"
88 )
89 config_name = cached_directory_path.parts[-3]
90 warning_msg = (
91 f"Found the latest cached dataset configuration '{config_name}' at {cached_directory_path} "
92 f"(last modified on {time.ctime(_get_modification_time(cached_directory_path))})."
93 )
ValueError: There are multiple 'mteb/arguana' configurations in the cache: queries, corpus, default
Please specify which configuration to reload from the cache, e.g.
load_dataset('mteb/arguana', 'queries')
It works when I run the same code with HF_HUB_OFFLINE=0, but once the data has been downloaded and I disable access to the Hugging Face Hub with HF_HUB_OFFLINE=1, this error appears.
Are there some files I am missing with hub disabled?
Steps to reproduce the bug
from datasets import load_dataset, get_dataset_config_names
ds = load_dataset("mteb/arguana")
with HF_HUB_OFFLINE=1
(after already running it with HF_HUB_OFFLINE=0 and populating the datasets cache)
Expected behavior
The dataset loads successfully from the cache with HF_HUB_OFFLINE=1, just as it does with HF_HUB_OFFLINE=0.
Environment info
- datasets version: 3.2.0
- Platform: Linux-5.15.148.2-2.cm2-x86_64-with-glibc2.35
- Python version: 3.10.14
- huggingface_hub version: 0.27.0
- PyArrow version: 17.0.0
- Pandas version: 2.2.3
- fsspec version: 2024.6.1
Metadata
Assignees
Labels
No labels