update load_dataset docstring #7301

Merged: 4 commits merged on Nov 29, 2024
Changes from 1 commit
style
lhoestq committed Nov 28, 2024
commit fae39ebc75ba9d83bcedd55740bc432034041109
8 changes: 5 additions & 3 deletions src/datasets/load.py
@@ -242,9 +242,11 @@ def __reduce__(self): # to make dynamically created class pickable, see _Initia
 def get_dataset_builder_class(
     dataset_module: "DatasetModule", dataset_name: Optional[str] = None
 ) -> Type[DatasetBuilder]:
-    with lock_importable_file(
-        dataset_module.importable_file_path
-    ) if dataset_module.importable_file_path else nullcontext():
+    with (
+        lock_importable_file(dataset_module.importable_file_path)
+        if dataset_module.importable_file_path
+        else nullcontext()
+    ):
         builder_cls = import_main_class(dataset_module.module_path)
     if dataset_module.builder_configs_parameters.builder_configs:
         dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
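For context, the reformatted block uses Python's parenthesized with-statement, officially supported since Python 3.10, which lets the conditional nullcontext() fallback be laid out one clause per line instead of wrapping a single long expression. Below is a minimal, self-contained sketch of the same pattern; file_lock() and load_module() are hypothetical stand-ins for lock_importable_file() and the surrounding datasets code, not the library's API.

# Sketch only: a conditional lock with a no-op fallback, written as a
# parenthesized with-statement. file_lock() is a hypothetical helper.
from contextlib import contextmanager, nullcontext
from typing import Iterator, Optional


@contextmanager
def file_lock(path: str) -> Iterator[None]:
    print(f"acquiring lock for {path}")
    try:
        yield
    finally:
        print(f"releasing lock for {path}")


def load_module(path: Optional[str] = None) -> str:
    # Lock only when a file path is given; otherwise enter a no-op context.
    with (
        file_lock(path)
        if path is not None
        else nullcontext()
    ):
        return "module loaded"


print(load_module("script.py"))  # acquires and releases the lock
print(load_module())             # no lock taken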
41 changes: 24 additions & 17 deletions tests/test_arrow_dataset.py
@@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory):
         import tensorflow as tf
         import torch
 
-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names
 
             self.assertIsNotNone(dset[0])
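The test updates group several context managers in one parenthesized with-statement. One detail worth noting: the managers are entered left to right, so a later line can use a name bound by an earlier line (above, tmp_dir feeds _create_dummy_dataset, and the first dset feeds dset.map). A short standard-library sketch of the same ordering, not taken from the test suite:

# Sketch only: managers inside the parentheses are entered in order, so each
# line may reuse the "as" names bound on the lines above it.
import os
import tempfile

with (
    tempfile.TemporaryDirectory() as tmp_dir,
    open(os.path.join(tmp_dir, "data.txt"), "w") as data_file,  # uses tmp_dir
    open(os.path.join(tmp_dir, "log.txt"), "w") as log_file,
):
    data_file.write("vec: 1.0 1.0 1.0\n")
    log_file.write("wrote 1 record\n")
# On exit both files are closed and the temporary directory is removed.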
@@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory):
         import tensorflow as tf
         import torch
 
-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names
 
             self.assertIsNotNone(dset[0])
@@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory):
         import tensorflow as tf
         import torch
 
-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset,
+        ):
             self.assertDictEqual(
                 dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
             )
@@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self):
         info1 = DatasetInfo(description="Dataset1")
         info2 = DatasetInfo(description="Dataset2")
         with tempfile.TemporaryDirectory() as tmp_dir:
-            with Dataset.from_dict(data1, info=info1).map(
-                cache_file_name=os.path.join(tmp_dir, "d1.arrow")
-            ) as dset1, Dataset.from_dict(data2, info=info2).map(
-                cache_file_name=os.path.join(tmp_dir, "d2.arrow")
-            ) as dset2, Dataset.from_dict(data3) as dset3:
+            with (
+                Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1,
+                Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2,
+                Dataset.from_dict(data3) as dset3,
+            ):
                 with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
                     self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
                     self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
@@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path):
 )
 def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):
     method, args, kwargs = method_and_params
-    with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file(
-        arrow_file, in_memory=in_memory
-    ) as reference_dataset:
+    with (
+        Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,
+        Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,
+    ):
         out = getattr(dataset, method)(*args, **kwargs)
         dataset = out if out is not None else dataset
         pickled_dataset = pickle.dumps(dataset)
7 changes: 4 additions & 3 deletions tests/test_py_utils.py
@@ -116,9 +116,10 @@ class Foo:
     ],
 )
 def test_map_nested_num_proc(iterable_length, num_proc, expected_num_proc):
-    with patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, patch(
-        "datasets.parallel.parallel.Pool"
-    ) as mock_multiprocessing_pool:
+    with (
+        patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested,
+        patch("datasets.parallel.parallel.Pool") as mock_multiprocessing_pool,
+    ):
         data_struct = {f"{i}": i for i in range(iterable_length)}
         _ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16)
         if expected_num_proc == 1:
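The mock-based tests read most naturally in the new style: each patch target gets its own line with its own as-name. A hedged sketch of the pattern, patching two standard-library functions purely for illustration (the real tests target datasets and elasticsearch internals):

# Sketch only: stacking unittest.mock.patch context managers in one
# parenthesized with-statement; both functions are restored on exit.
import os
import time
from unittest.mock import patch

with (
    patch("os.cpu_count") as mock_cpu_count,
    patch("time.sleep") as mock_sleep,
):
    mock_cpu_count.return_value = 4
    assert os.cpu_count() == 4      # call is answered by the mock
    time.sleep(60)                  # intercepted, returns immediately
    mock_sleep.assert_called_once_with(60)

os.cpu_count()  # the original function is back once the block exits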
16 changes: 10 additions & 6 deletions tests/test_search.py
@@ -88,9 +88,11 @@ def test_add_elasticsearch_index(self):
         from elasticsearch import Elasticsearch
 
         dset: Dataset = self._create_dummy_dataset()
-        with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
-            "elasticsearch.client.IndicesClient.create"
-        ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
+        with (
+            patch("elasticsearch.Elasticsearch.search") as mocked_search,
+            patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
+            patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
+        ):
             mocked_index_create.return_value = {"acknowledged": True}
             mocked_bulk.return_value([(True, None)] * 30)
             mocked_search.return_value = {"hits": {"hits": [{"_score": 1, "_id": 29}]}}
@@ -198,9 +200,11 @@ class ElasticSearchIndexTest(TestCase):
     def test_elasticsearch(self):
         from elasticsearch import Elasticsearch
 
-        with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
-            "elasticsearch.client.IndicesClient.create"
-        ) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
+        with (
+            patch("elasticsearch.Elasticsearch.search") as mocked_search,
+            patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
+            patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
+        ):
             es_client = Elasticsearch()
             mocked_index_create.return_value = {"acknowledged": True}
             index = ElasticSearchIndex(es_client=es_client)
5 changes: 3 additions & 2 deletions tests/test_upstream_hub.py
@@ -242,8 +242,9 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
         with temporary_repo() as ds_name:
             self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
             num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
-            with patch("datasets.config.MAX_SHARD_SIZE", "16KB"), patch(
-                "datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1
+            with (
+                patch("datasets.config.MAX_SHARD_SIZE", "16KB"),
+                patch("datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1),
             ):
                 local_ds.push_to_hub(ds_name, token=self._token)
                 hub_ds = load_dataset(ds_name, download_mode="force_redownload")
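The last hunk patches module-level constants rather than functions: patch(target, new_value) swaps the attribute for the given value for the duration of the block, which is what forces the push in this test to be split across several small commits. A generic sketch of that form, using a stand-in module created on the fly rather than the real datasets.config:

# Sketch only: overriding module-level configuration values with
# patch(target, new_value); the originals are restored when the block exits.
import sys
import types
from unittest.mock import patch

# Stand-in config module for the example (the real test patches
# datasets.config.MAX_SHARD_SIZE and datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT).
app_config = types.ModuleType("app_config")
app_config.MAX_SHARD_SIZE = "500MB"
app_config.UPLOADS_PER_COMMIT = 50
sys.modules["app_config"] = app_config

with (
    patch("app_config.MAX_SHARD_SIZE", "16KB"),
    patch("app_config.UPLOADS_PER_COMMIT", 1),
):
    assert app_config.MAX_SHARD_SIZE == "16KB"
    assert app_config.UPLOADS_PER_COMMIT == 1

assert app_config.MAX_SHARD_SIZE == "500MB"  # restored on exit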