No need for dataset_info #7234

Merged · 7 commits · Oct 21, 2024
Changes from 1 commit
no need for dataset_info
lhoestq committed Oct 17, 2024
commit d07730a578470d11309d36fa443fddf414275029
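
In short, this commit drops the upfront HfApi.dataset_info call: downloading the dataset card with hf_hub_download both fetches README.md and pins a commit hash, because the file is cached under a .../snapshots/<commit_hash>/ directory. A minimal sketch of that idea (not this PR's exact code; the "squad" repo id is a stand-in):

import os
from huggingface_hub import HfApi

api = HfApi()  # assumes the default https://huggingface.co endpoint
# hf_hub_download caches files under .../snapshots/<commit_hash>/README.md,
# so the resolved commit hash can be read back off the returned path.
readme_path = api.hf_hub_download(
    repo_id="squad",        # stand-in dataset repo id
    filename="README.md",
    repo_type="dataset",
)
commit_hash = os.path.basename(os.path.dirname(readme_path))
print(commit_hash)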
47 changes: 25 additions & 22 deletions src/datasets/load.py
@@ -39,7 +39,7 @@
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError, get_session

-from . import config
+from . import __version__, config
 from .arrow_dataset import Dataset
 from .builder import BuilderConfig, DatasetBuilder
 from .data_files import (
@@ -989,34 +989,37 @@ def __init__(
         increase_load_count(name)

     def get_module(self) -> DatasetModule:
-        hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
-            self.name,
-            revision=self.revision,
+        # Get the Dataset Card and fix the revision in case there are new commits in the meantime
+        api = HfApi(
+            endpoint=config.HF_ENDPOINT,
             token=self.download_config.token,
             timeout=100.0,
+            library_name="datasets",
+            library_version=__version__,
+            user_agent=get_datasets_user_agent(self.download_config.user_agent),
         )
-        # even if metadata_configs is not None (which means that we will resolve files for each config later)
-        # we cannot skip resolving all files because we need to infer module name by files extensions
-        revision = hfh_dataset_info.sha  # fix the revision in case there are new commits in the meantime
-        base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip("/")
-
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading readme"
         try:
-            dataset_readme_path = cached_path(
-                hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
-                download_config=download_config,
+            dataset_readme_path = api.hf_hub_download(
+                repo_id=self.name,
+                filename=config.REPOCARD_FILENAME,
+                repo_type="dataset",
+                revision=self.revision,
+                proxies=self.download_config.proxies,
             )
-            dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
+            commit_hash = os.path.dirname(dataset_readme_path)
+            dataset_card_data = DatasetCard.load(dataset_readme_path).data
         except FileNotFoundError:
+            commit_hash = api.dataset_info(
+                self.name,
+                revision=self.revision,
+                timeout=100.0,
+            ).sha
             dataset_card_data = DatasetCardData()
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading standalone yaml"
         try:
             standalone_yaml_path = cached_path(
-                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=commit_hash),
                 download_config=download_config,
             )
             with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1027,6 +1030,7 @@ def get_module(self) -> DatasetModule:
                     dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
         except FileNotFoundError:
             pass
+        base_path = f"hf://datasets/{self.name}@{commit_hash}/{self.data_dir or ''}".rstrip("/")
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
         # Use the infos from the parquet export except in some cases:
@@ -1110,7 +1114,7 @@ def get_module(self) -> DatasetModule:
             ]
             default_config_name = None
         builder_kwargs = {
-            "base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
+            "base_path": hf_dataset_url(self.name, "", revision=commit_hash).rstrip("/"),
             "repo_id": self.name,
             "dataset_name": camelcase_to_snakecase(Path(self.name).name),
         }
@@ -1122,7 +1126,7 @@ def get_module(self) -> DatasetModule:
         try:
             # this file is deprecated and was created automatically in old versions of push_to_hub
             dataset_infos_path = cached_path(
-                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=commit_hash),
                 download_config=download_config,
             )
             with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1143,10 +1147,9 @@ def get_module(self) -> DatasetModule:
         if default_config_name is None and len(dataset_infos) == 1:
             default_config_name = next(iter(dataset_infos))

-        hash = revision
         return DatasetModule(
             module_path,
-            hash,
+            commit_hash,
             builder_kwargs,
             dataset_infos=dataset_infos,
             builder_configs_parameters=BuilderConfigsParameters(
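
For repos without a README.md, the diff falls back to resolving the revision through dataset_info, so the commit hash is still pinned. A hedged sketch of that fallback (again with "squad" as a stand-in repo id):

from huggingface_hub import HfApi

api = HfApi()
# dataset_info resolves a branch or tag to the exact commit it points at;
# .sha is the full commit hash.
info = api.dataset_info("squad", revision="main", timeout=100.0)
print(info.sha)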