Merged

34 commits
7873621  Let's test like this (Wauplin, Sep 18, 2025)
07b3a9d  code quality (Wauplin, Sep 18, 2025)
532e507  add back requests (Wauplin, Sep 19, 2025)
fe845fb  install transformers from source (Wauplin, Sep 19, 2025)
0b1d915  t :wq (Wauplin, Sep 22, 2025)
8b5e0f3  will it work? (Wauplin, Sep 22, 2025)
09ac910  to remove later: don't fail fast (Wauplin, Sep 22, 2025)
bb92173  don't fail fast (Wauplin, Sep 22, 2025)
bd6945c  fix test fixture (Wauplin, Sep 22, 2025)
ac365fa  fix OfflineModeIsEnabled test (Wauplin, Sep 22, 2025)
248b052  huggingface_hub 1.0.0 even if deps latest (Wauplin, Sep 22, 2025)
167882c  will be broken but better (Wauplin, Sep 23, 2025)
3c7c555  pip list in CI (Wauplin, Sep 24, 2025)
67f8d2a  revert branch (Wauplin, Sep 24, 2025)
9a23ff5  install latest only in latest tests (Wauplin, Sep 24, 2025)
9efbb02  offline (Wauplin, Sep 24, 2025)
48e55a4  get back to normal (Wauplin, Sep 29, 2025)
ac0366d  better (Wauplin, Sep 29, 2025)
01fd011  ofc (Wauplin, Sep 29, 2025)
6a7151e  why not (Wauplin, Sep 29, 2025)
b799352  as before (Wauplin, Sep 29, 2025)
784256c  this time is good (Wauplin, Sep 29, 2025)
f7dcf49  fix yaml format (Wauplin, Sep 29, 2025)
ace01a3  system (Wauplin, Sep 29, 2025)
a472f80  fix import in o.x (Wauplin, Sep 29, 2025)
f4f0f6e  :/ (Wauplin, Sep 29, 2025)
83e3d2c  Bump minimal version to 0.25.0 (Wauplin, Sep 30, 2025)
e39a04d  x-compatible offline helper (Wauplin, Sep 30, 2025)
d6e6dbc  code quality (Wauplin, Sep 30, 2025)
e94c469  fix utils tests (Wauplin, Sep 30, 2025)
f9f2f00  fixing last bits (Wauplin, Sep 30, 2025)
50f40c2  x-version compat (Wauplin, Sep 30, 2025)
e200c23  Merge branch 'main' into ci-test-huggingface-hub-v1.0.0.rc0 (Wauplin, Sep 30, 2025)
7903f24  final commit (Wauplin, Sep 30, 2025)
21 changes: 18 additions & 3 deletions .github/workflows/ci.yml
@@ -72,10 +72,16 @@ jobs:
         run: uv pip install --system "datasets[tests] @ ."
       - name: Install dependencies (latest versions)
         if: ${{ matrix.deps_versions == 'deps-latest' }}
-        run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9"
+        run: |
+          uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9"
+          # TODO: remove once transformers v5 / huggingface_hub v1 are released officially
+          uv pip uninstall --system transformers huggingface_hub
+          uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git
       - name: Install dependencies (minimum versions)
         if: ${{ matrix.deps_versions != 'deps-latest' }}
-        run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1
+        run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.25.0 transformers dill==0.3.1.1
+      - name: Print dependencies
+        run: uv pip list
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
@@ -119,6 +125,8 @@ jobs:
         run: pip install --upgrade uv
       - name: Install dependencies
         run: uv pip install --system "datasets[tests] @ ."
+      - name: Print dependencies
+        run: uv pip list
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
@@ -161,7 +169,14 @@ jobs:
       - name: Install uv
         run: pip install --upgrade uv
       - name: Install dependencies
-        run: uv pip install --system "datasets[tests_numpy2] @ ."
+        run: |
+          uv pip install --system "datasets[tests_numpy2] @ ."
+          # TODO: remove once transformers v5 / huggingface_hub v1 are released officially
+          uv pip uninstall --system transformers huggingface_hub
+          uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git
+      - name: Print dependencies
+        run: pip list
+
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
3 changes: 2 additions & 1 deletion setup.py
@@ -118,6 +118,7 @@
     "pandas",
     # for downloading datasets over HTTPS
     "requests>=2.32.2",
+    "httpx<1.0.0",
     # progress bars in downloads and data operations
     "tqdm>=4.66.3",
     # for fast hashing
@@ -128,7 +129,7 @@
     # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
     "fsspec[http]>=2023.1.0,<=2025.9.0",
     # To get datasets from the Datasets Hub on huggingface.co
-    "huggingface-hub>=0.24.0",
+    "huggingface-hub>=0.25.0,<2.0",

Comment from Wauplin (Contributor, Author): Bumping to 0.25.0 to be able to use update_repo_settings in tests. Honestly not a big breaking change IMO (dates back to Sept. 2024)

     # Utilities from PyPA to e.g., compare versions
     "packaging",
     # To parse YAML metadata from dataset cards
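The comment above refers to HfApi.update_repo_settings, which this PR uses in tests/fixtures/hub.py (see that diff further down) to replace a hand-rolled requests.put call. A minimal sketch of the new call, with a placeholder endpoint and repo id:

```python
# Sketch only: gate a dataset repo with HfApi.update_repo_settings
# (available since huggingface_hub 0.25.0) instead of PUT-ing to
# /api/datasets/{repo_id}/settings by hand.
from huggingface_hub import HfApi

api = HfApi(endpoint="https://hub-ci.huggingface.co")  # placeholder endpoint
repo_id = "__DUMMY_USER__/my-dataset"  # placeholder repo id

# One client call; auth headers and HTTP error raising are handled internally.
api.update_repo_settings(repo_id, repo_type="dataset", gated="auto")
```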
8 changes: 4 additions & 4 deletions src/datasets/arrow_dataset.py
@@ -68,9 +68,9 @@
     DatasetCardData,
     HfApi,
 )
-from huggingface_hub.hf_api import HfHubHTTPError, RepoFile, RepositoryNotFoundError
+from huggingface_hub.hf_api import RepoFile
+from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
 from multiprocess import Pool
-from requests import HTTPError
 from tqdm.contrib.concurrent import thread_map

 from . import config
@@ -5990,7 +5990,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code == 409
             ):
                 # 409 is Conflict (another commit is in progress)
@@ -6040,7 +6040,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code in (412, 409)
             ):
                 # 412 is Precondition failed (parent_commit isn't satisfied)
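For context, the two hunks above sit inside push_to_hub's commit logic. A condensed sketch of the retry pattern they guard (simplified; commit_fn and the backoff are illustrative, not the exact datasets implementation):

```python
# With huggingface_hub 1.x (httpx-based), the exception chained behind an
# HfHubHTTPError is itself an HfHubHTTPError rather than requests.HTTPError,
# hence the isinstance() change in this diff.
import time

from huggingface_hub.utils import HfHubHTTPError


def commit_with_retry(commit_fn, max_retries=3):
    for attempt in range(max_retries):
        try:
            return commit_fn()
        except HfHubHTTPError as err:
            cause = err.__context__
            if cause is not None and isinstance(cause, HfHubHTTPError) and cause.response.status_code == 409:
                # 409 is Conflict (another commit is in progress): back off, retry
                time.sleep(2**attempt)
                continue
            raise
    raise RuntimeError("commit still conflicting after retries (HTTP 409)")
```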
2 changes: 1 addition & 1 deletion src/datasets/data_files.py
@@ -352,7 +352,7 @@ def resolve_pattern(
     protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
     protocol_prefix = protocol + "://" if protocol != "file" else ""
     glob_kwargs = {}
-    if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"):
+    if protocol == "hf":
         # 10 times faster glob with detail=True (ignores costly info like lastCommit)
         glob_kwargs["expand_info"] = False
     matched_paths = [
9 changes: 4 additions & 5 deletions src/datasets/dataset_dict.py
@@ -26,7 +26,6 @@
 )
 from huggingface_hub.hf_api import RepoFile
 from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
-from requests import HTTPError

 from . import config
 from .arrow_dataset import (
@@ -1917,7 +1916,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code == 409
             ):
                 # 409 is Conflict (another commit is in progress)
@@ -1967,7 +1966,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code in (412, 409)
             ):
                 # 412 is Precondition failed (parent_commit isn't satisfied)
@@ -2786,7 +2785,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code == 409
             ):
                 # 409 is Conflict (another commit is in progress)
@@ -2836,7 +2835,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code in (412, 409)
             ):
                 # 412 is Precondition failed (parent_commit isn't satisfied)
5 changes: 2 additions & 3 deletions src/datasets/iterable_dataset.py
@@ -30,7 +30,6 @@
 from huggingface_hub.hf_api import RepoFile
 from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
 from multiprocess import Pool
-from requests import HTTPError

 from . import config
 from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset, DatasetInfoMixin
@@ -4332,7 +4331,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code == 409
             ):
                 # 409 is Conflict (another commit is in progress)
@@ -4382,7 +4381,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
         except HfHubHTTPError as err:
             if (
                 err.__context__
-                and isinstance(err.__context__, HTTPError)
+                and isinstance(err.__context__, HfHubHTTPError)
                 and err.__context__.response.status_code in (412, 409)
             ):
                 # 412 is Precondition failed (parent_commit isn't satisfied)
5 changes: 5 additions & 0 deletions src/datasets/load.py
@@ -28,6 +28,7 @@
 from typing import Any, Optional, Union

 import fsspec
+import httpx
 import requests
 import yaml
 from fsspec.core import url_to_fs
@@ -948,6 +949,8 @@ def dataset_module_factory(
                 OfflineModeIsEnabled,
                 requests.exceptions.Timeout,
                 requests.exceptions.ConnectionError,
+                httpx.ConnectError,
+                httpx.TimeoutException,
             ),
         ):
             raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
@@ -963,6 +966,8 @@ def dataset_module_factory(
             OfflineModeIsEnabled,
             requests.exceptions.Timeout,
             requests.exceptions.ConnectionError,
+            httpx.ConnectError,
+            httpx.TimeoutException,
         ) as e:
             raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
         except GatedRepoError as e:
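The two hunks above widen dataset_module_factory's offline detection so that httpx transport failures from huggingface_hub 1.x are treated like the requests ones. A sketch of the normalization pattern (helper name and signature are illustrative):

```python
# Transport errors from either HTTP backend (requests in huggingface_hub 0.x,
# httpx in 1.x) are normalized into a single ConnectionError.
import httpx
import requests

_NETWORK_ERRORS = (
    requests.exceptions.Timeout,
    requests.exceptions.ConnectionError,
    httpx.ConnectError,
    httpx.TimeoutException,
)  # datasets additionally includes OfflineModeIsEnabled in this tuple


def raise_connection_error(path: str, exc: BaseException) -> None:
    if isinstance(exc, _NETWORK_ERRORS):
        raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(exc).__name__})") from exc
    raise exc
```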
13 changes: 6 additions & 7 deletions src/datasets/utils/file_utils.py
@@ -27,12 +27,13 @@
 from xml.etree import ElementTree as ET

 import fsspec
+import httpx
 import huggingface_hub
 import huggingface_hub.errors
 import requests
 from fsspec.core import strip_protocol, url_to_fs
 from fsspec.utils import can_be_local
-from huggingface_hub.utils import EntryNotFoundError, get_session, insecure_hashlib
+from huggingface_hub.utils import get_session, insecure_hashlib
 from packaging import version

 from .. import __version__, config
@@ -140,7 +141,7 @@ def cached_path(
         ConnectionError: in case of unreachable url
             and no cache on disk
         ValueError: if it couldn't parse the url or filename correctly
-        requests.exceptions.ConnectionError: in case of internet connection issue
+        httpx.NetworkError or requests.exceptions.ConnectionError: in case of internet connection issue
     """
     if download_config is None:
         download_config = DownloadConfig(**download_kwargs)
@@ -246,7 +247,7 @@
 def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
     ua = f"datasets/{__version__}"
     ua += f"; python/{config.PY_VERSION}"
-    ua += f"; huggingface_hub/{huggingface_hub.__version__}"
+    ua += f"; hf_hub/{huggingface_hub.__version__}"
     ua += f"; pyarrow/{config.PYARROW_VERSION}"
     if config.TORCH_AVAILABLE:
         ua += f"; torch/{config.TORCH_VERSION}"
@@ -753,7 +754,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int:
     fs, *_ = url_to_fs(path, **storage_options)
     try:
         size = fs.size(main_hop)
-    except EntryNotFoundError:
+    except huggingface_hub.utils.EntryNotFoundError:
         raise FileNotFoundError(f"No such file: {path}")
     if size is None:
         # use xopen instead of fs.open to make data fetching more robust
@@ -817,6 +818,7 @@ def read_with_retries(*args, **kwargs):
                 asyncio.TimeoutError,
                 requests.exceptions.ConnectionError,
                 requests.exceptions.Timeout,
+                httpx.RequestError,
             ) as err:
                 disconnect_err = err
                 logger.warning(
@@ -897,9 +899,6 @@ def _prepare_single_hop_path_and_storage_options(
             "endpoint": config.HF_ENDPOINT,
             **storage_options,
         }
-        # streaming with block_size=0 is only implemented in 0.21 (see https://github.com/huggingface/huggingface_hub/pull/1967)
-        if config.HF_HUB_VERSION < version.parse("0.21.0"):
-            storage_options["block_size"] = "default"
         if storage_options:
             storage_options = {protocol: storage_options}
     return urlpath, storage_options
8 changes: 0 additions & 8 deletions src/datasets/utils/hub.py
@@ -1,14 +1,6 @@
 from functools import partial

 from huggingface_hub import hf_hub_url
-from huggingface_hub.utils import get_session, hf_raise_for_status


 hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
-
-
-def check_auth(hf_api, repo_id, token=None):
-    headers = hf_api._build_hf_headers(token=token)
-    path = f"{hf_api.endpoint}/api/datasets/{repo_id}/auth-check"
-    r = get_session().get(path, headers=headers)
-    hf_raise_for_status(r)
33 changes: 15 additions & 18 deletions tests/fixtures/hub.py
@@ -5,9 +5,8 @@
 from typing import Optional

 import pytest
-import requests
-from huggingface_hub.hf_api import HfApi, RepositoryNotFoundError
-from huggingface_hub.utils import hf_raise_for_status
+from huggingface_hub.hf_api import HfApi
+from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
 from huggingface_hub.utils._headers import _http_user_agent


@@ -24,9 +23,14 @@
 def ci_hub_config(monkeypatch):
     monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT)
     monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL)
-    monkeypatch.setattr(
-        "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE
-    )
+    monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE)
+    try:
+        # for backward compatibility with huggingface_hub 0.x
+        monkeypatch.setattr(
+            "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE
+        )
+    except AttributeError:
+        pass
     old_environ = dict(os.environ)
     os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT
     yield
@@ -107,18 +111,11 @@ def _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content):
         repo_id=repo_id,
         repo_type="dataset",
     )
-    path = f"{hf_api.endpoint}/api/datasets/{repo_id}/settings"
-    repo_settings = {"gated": "auto"}
-    r = requests.put(
-        path,
-        headers={"authorization": f"Bearer {hf_token}"},
-        json=repo_settings,
-    )
-    hf_raise_for_status(r)
+    hf_api.update_repo_settings(repo_id, token=hf_token, repo_type="dataset", gated="auto")
     yield repo_id
     try:
         hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
-    except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
+    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
         pass
@@ -142,7 +139,7 @@ def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content
     yield repo_id
     try:
         hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
-    except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
+    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
         pass
@@ -166,7 +163,7 @@ def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_wi
     yield repo_id
     try:
         hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
-    except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
+    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
         pass
@@ -190,7 +187,7 @@ def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_
     yield repo_id
     try:
         hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
-    except (requests.exceptions.HTTPError, ValueError):  # catch http error and token invalid error
+    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error
         pass
25 changes: 9 additions & 16 deletions tests/test_load.py
@@ -11,7 +11,6 @@
 import dill
 import pyarrow as pa
 import pytest
-import requests

 import datasets
 from datasets import config, load_dataset
@@ -767,10 +766,7 @@ def test_load_dataset_from_hub(self):
     def test_load_dataset_namespace(self):
         with self.assertRaises(DatasetNotFoundError) as context:
             datasets.load_dataset("hf-internal-testing/_dummy")
-        self.assertIn(
-            "hf-internal-testing/_dummy",
-            str(context.exception),
-        )
+        self.assertIn("hf-internal-testing/_dummy", str(context.exception))
         for offline_simulation_mode in list(OfflineSimulationMode):
             with offline(offline_simulation_mode):
                 with self.assertRaises(ConnectionError) as context:
@@ -1050,19 +1046,16 @@ def test_load_dataset_with_unsupported_extensions(text_dir_with_unsupported_exte

 @pytest.mark.integration
 def test_loading_from_the_datasets_hub_with_token():
-    true_request = requests.Session().request
-
-    def assert_auth(method, url, *args, headers, **kwargs):
-        assert headers["authorization"] == "Bearer foo"
-        return true_request(method, url, *args, headers=headers, **kwargs)
+    class CustomException(Exception):
+        pass

-    with patch("requests.Session.request") as mock_request:
-        mock_request.side_effect = assert_auth
+    with patch("huggingface_hub.file_download._get_metadata_or_catch_error") as mock_request:
+        mock_request.side_effect = CustomException()
         with tempfile.TemporaryDirectory() as tmp_dir:
-            with offline():
-                with pytest.raises((ConnectionError, requests.exceptions.ConnectionError)):
-                    load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token="foo")
-    mock_request.assert_called()
+            with pytest.raises(CustomException):
+                load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token="foo")
+    mock_request.assert_called_once()
+    assert mock_request.call_args_list[0][1]["headers"]["authorization"] == "Bearer foo"


 @pytest.mark.integration