22 commits
7544616
Add option only_splits to DownloadConfig
albertvillanova Apr 22, 2021
43724e5
Test load_datasets only_splits
albertvillanova Apr 22, 2021
dff9fed
Filter split_generators in only_splits
albertvillanova Apr 22, 2021
e0e1aa5
Mark DownloadConfig params with default values
albertvillanova Apr 22, 2021
6a0c3fc
Handle passed DownloadConfig with only_splits only
albertvillanova Apr 22, 2021
75ea3d4
Fix test
albertvillanova Apr 22, 2021
d4d7f28
Fix DownloadConfig
albertvillanova Apr 22, 2021
07c2c02
Fix MockDownloadManager
albertvillanova Apr 22, 2021
e231072
Refactorize test
albertvillanova Apr 23, 2021
2f06ae4
Test downloaded files by load_datasets only_splits
albertvillanova Apr 23, 2021
84ac280
Download only splits in only_splits
albertvillanova Apr 23, 2021
90be20e
Fix returned extracted paths to handle missing keys
albertvillanova Apr 23, 2021
3405398
Fix returned extracted paths if dict
albertvillanova Apr 23, 2021
3a137c5
Rename DownloadConfig.only_splits to DownloadConfig.splits
albertvillanova Apr 28, 2021
f65be54
Pass splits to _split_generators
albertvillanova Apr 28, 2021
67e86a9
Merge remote-tracking branch 'upstream/master' into load-only-splits
albertvillanova Apr 28, 2021
f1b6e3d
Fix undefined variable
albertvillanova Apr 28, 2021
f733347
Merge remote-tracking branch 'upstream/master' into load-only-splits
albertvillanova May 3, 2021
111e32a
Merge remote-tracking branch 'upstream/master' into load-only-splits
albertvillanova Jun 23, 2021
c398de5
Use a explicit not_downloaded placeholder in case users get errors
albertvillanova Jun 23, 2021
a001084
Rename DownloadConfig attributes
albertvillanova Jun 23, 2021
0678193
Merge remote-tracking branch 'upstream/master' into load-only-splits
albertvillanova Jun 25, 2021
23 changes: 22 additions & 1 deletion src/datasets/builder.py
@@ -491,6 +491,15 @@ def download_and_prepare(
use_etag=False,
use_auth_token=use_auth_token,
) # We don't use etag for data files to speed up the process
else:
if not download_config.cache_dir:
download_config.cache_dir = os.path.join(self._cache_dir_root, "downloads")
if not download_config._is_force_download_set_by_user:
download_config.force_download = bool(download_mode == GenerateMode.FORCE_REDOWNLOAD)
if not download_config._is_use_etag_set_by_user:
download_config.use_etag = False
if download_config.use_auth_token is None:
download_config.use_auth_token = use_auth_token

dl_manager = DownloadManager(
dataset_name=self.name,
@@ -631,7 +640,19 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
# Generating data for all splits
split_dict = SplitDict(dataset_name=self.name)
split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
try:
split_generators = self._split_generators(
dl_manager, splits=dl_manager._download_config.splits, **split_generators_kwargs
)
except TypeError:
split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
# If self._split_generators did not already filter by the requested splits, filter now so that, at least, non-requested splits are not cached
if dl_manager._download_config.splits:
split_generators = [
split_generator
for split_generator in split_generators
if split_generator.name in dl_manager._download_config.splits
]

# Checksums verification
if verify_infos:
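
Editor's note: for dataset scripts that opt in, _split_generators can accept the new optional splits argument; scripts that don't define it keep working through the TypeError fallback above. A minimal sketch of such an opted-in script (the URLs, feature schema, and dataset class are hypothetical, not code from this PR):

import datasets

_URLS = {
    "train": "https://example.com/train.jsonl",
    "test": "https://example.com/test.jsonl",
}

class MyDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(features=datasets.Features({"text": datasets.Value("string")}))

    def _split_generators(self, dl_manager, splits=None):
        # Download only the requested splits; fall back to everything if no filter was given.
        urls = {name: _URLS[name] for name in splits} if splits else _URLS
        paths = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(name=name, gen_kwargs={"filepath": path})
            for name, path in paths.items()
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"text": line.strip()}
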
11 changes: 10 additions & 1 deletion src/datasets/utils/download_manager.py
@@ -18,6 +18,7 @@

import enum
import os
from collections import defaultdict
from datetime import datetime
from functools import partial
from typing import Dict, Optional, Union
@@ -184,6 +185,10 @@ def download(self, url_or_urls):
downloaded_path(s): `str`, The downloaded paths matching the given input
url_or_urls.
"""
if self._download_config.splits:
if isinstance(url_or_urls, dict) and all(split in url_or_urls for split in self._download_config.splits):
lhoestq (Member) commented:

I would add an additional check just to avoid unwanted behaviors.
For example, right now, if a dataset script passes the dl_manager a dict like this:

{"main_data": url_to_main_data, "secondary_data": url_to_sec_data}

then this trick here would use url_or_urls = {} since the keys of the dict are not split names.

Maybe you could check that sorted(url_or_urls.keys()) == sorted(self._download_config.splits) before filtering?

lhoestq (Member) commented on May 3, 2021:

Edit:

I meant a dict like this

{"main_metadata": url_to_main_data, "secondary_metadata": url_to_sec_data, "train": url_train_data, "test": url_test_data}

albertvillanova (Member, Author) commented on Jun 23, 2021:

Hi @lhoestq. Sorry, I'm not sure I understand what you mean... 😅

What I am checking here is that the keys in self._download_config.splits are a subset of the keys in url_or_urls.

lhoestq (Member) commented on Jun 23, 2021:

If you pass a dictionary like this:

{"main_metadata": url_to_main_data,
"secondary_metadata": url_to_sec_data,
"train": url_train_data,
"test": url_test_data}

then only the train or test keys will be kept, which I feel is not intuitive.

For example, if the user asks to load the "train" split, then the main and secondary metadata won't be downloaded.
You can fix that by keeping all the keys except the splits to ignore.

albertvillanova (Member, Author) commented:

Thanks @lhoestq, I understand it now.

albertvillanova (Member, Author) commented:

See my comment below.

url_or_urls = {split: url_or_urls[split] for split in self._download_config.splits}

download_config = self._download_config.copy()
download_config.extract_compressed_file = False
# Default to using 16 parallel thread for downloading
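
Editor's note: one way to implement lhoestq's suggestion above, keeping every non-split key (such as shared metadata) and dropping only the split keys the user did not request, might be the following sketch; the helper name and the known_splits default are hypothetical, not part of this PR:

def _filter_urls_by_splits(url_or_urls, requested_splits, known_splits=("train", "validation", "test")):
    # Keep keys that are not split names at all (e.g. "main_metadata"),
    # and among split keys keep only the requested ones.
    if not isinstance(url_or_urls, dict) or not requested_splits:
        return url_or_urls
    return {
        key: value
        for key, value in url_or_urls.items()
        if key not in known_splits or key in requested_splits
    }

With {"main_metadata": ..., "train": ..., "test": ...} and requested_splits=["train"], this keeps main_metadata and train, dropping only test.
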
@@ -268,7 +273,11 @@ def extract(self, path_or_paths, num_proc=None):
path_or_paths = NestedDataStructure(path_or_paths)
extracted_paths = NestedDataStructure(extracted_paths)
self.extracted_paths.update(dict(zip(path_or_paths.flatten(), extracted_paths.flatten())))
return extracted_paths.data
return (
extracted_paths.data
if not isinstance(extracted_paths.data, dict)
else defaultdict(lambda: "<NOT_DOWNLOADED>", extracted_paths.data)
)
lhoestq (Member) commented:

Why do you need this?

albertvillanova (Member, Author) commented on May 3, 2021:

Because the user implementation of the method _split_generators will access all possible keys in the extracted paths dict, most likely through __getitem__. If our downloaded paths don't contain all possible splits (but only a subset), accessing the missing ones through __getitem__ will raise a KeyError exception.

If a defaultdict is returned instead, trying to access a missing (not downloaded) split will be silently ignored.

lhoestq (Member) commented:

Makes sense! Maybe use a more explicit placeholder than an empty string, in case users experience errors using this?

albertvillanova (Member, Author) commented:

I've set "<NOT_DOWNLOADED>" instead.
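
Editor's note: the behavior under discussion can be shown in isolation; a minimal sketch (the paths are made up):

from collections import defaultdict

extracted = {"train": "/cache/extracted/train.jsonl"}  # only "train" was downloaded
paths = defaultdict(lambda: "<NOT_DOWNLOADED>", extracted)

assert paths["train"] == "/cache/extracted/train.jsonl"
# Indexing a split that was not downloaded yields the placeholder instead of a KeyError,
# so a script's _split_generators can still index every split it knows about.
assert paths["test"] == "<NOT_DOWNLOADED>"
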


def download_and_extract(self, url_or_urls):
"""Download and extract given url_or_urls.
15 changes: 13 additions & 2 deletions src/datasets/utils/file_utils.py
@@ -228,17 +228,28 @@ class DownloadConfig:
"""

cache_dir: Optional[Union[str, Path]] = None
force_download: bool = False
force_download: Optional[bool] = None # default False
_is_force_download_set_by_user: bool = True
resume_download: bool = False
local_files_only: bool = False
proxies: Optional[Dict] = None
user_agent: Optional[str] = None
extract_compressed_file: bool = False
force_extract: bool = False
use_etag: bool = True
use_etag: Optional[bool] = None # default True
_is_use_etag_set_by_user: bool = True
num_proc: Optional[int] = None
max_retries: int = 1
use_auth_token: Optional[Union[str, bool]] = None
splits: Optional[list] = None
lhoestq (Member) commented:

The DownloadConfig class was used until now as the class that defines the parameters we pass to the cached_path function, and it had no logic related to the DownloadManager or to datasets specifically.
Therefore I'm not sure this is the best place to put this argument. Let me know what you think.

Maybe this could be an argument of the DownloadManager itself.

albertvillanova (Member, Author) commented on May 3, 2021:

I don't have a clear opinion on this yet (I'm going to think about it), but I can tell you that, for readability, a data class called DownloadConfig should clearly be used by a DownloadManager class.

In my opinion, it is quite sensible to include a download configuration setting (whether to download all or only some of the files) in a DownloadConfig object. And in order to avoid passing lots of parameters to load_dataset, it makes sense to pass the download-related parameters in a DownloadConfig class.

As a side note, I am also planning to refactor cached_path because it contains too many different coupled functionalities: download, extract, load from cache... For the moment, I have extracted all the Extract functionalities, but I am planning further refactoring...

lhoestq (Member) commented:

The separation between the two configs makes sense to me, and I totally agree with you on the naming.
Maybe we could have DownloadConfig as a subclass of CachedPathConfig or something like that (or another relation between the two).

Regarding cached_path, note that this is a function we have in common with the other libraries (transformers and huggingface_hub), so it could be worth discussing the changes we want to make with the other maintainers.


def __post_init__(self):
if self.use_etag is None:
self.use_etag = True
self._is_use_etag_set_by_user = False
if self.force_download is None:
self.force_download = False
self._is_force_download_set_by_user = False

def copy(self) -> "DownloadConfig":
return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
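
Editor's note: the __post_init__ above implements a sentinel-default pattern: None means "not set by the user", so download_and_prepare can safely override the effective default later. The same idiom in isolation, as a standalone sketch rather than the actual class:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Config:
    force_download: Optional[bool] = None  # effective default: False

    def __post_init__(self):
        # None can only mean the user did not pass a value.
        self._is_force_download_set_by_user = self.force_download is not None
        if self.force_download is None:
            self.force_download = False

assert Config()._is_force_download_set_by_user is False
assert Config(force_download=False)._is_force_download_set_by_user is True
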
5 changes: 4 additions & 1 deletion src/datasets/utils/mock_download_manager.py
@@ -22,7 +22,7 @@
from pathlib import Path
from typing import Callable, List, Optional, Union

from .file_utils import cached_path, hf_github_url
from .file_utils import DownloadConfig, cached_path, hf_github_url
from .logging import get_logger
from .version import Version

@@ -61,6 +61,9 @@ def __init__(
self._dummy_file = None
self._bucket_url = None

# As DownloadManager
self._download_config = DownloadConfig()

@property
def dummy_file(self):
if self._dummy_file is None:
38 changes: 38 additions & 0 deletions tests/test_load.py
@@ -5,6 +5,7 @@
import time
from functools import partial
from hashlib import sha256
from pathlib import Path
from unittest import TestCase
from unittest.mock import patch

@@ -17,6 +18,7 @@
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset
from datasets.load import prepare_module
from datasets.utils.file_utils import DownloadConfig

from .utils import (
OfflineSimulationMode,
@@ -317,3 +319,39 @@ def test_load_from_disk_with_default_in_memory(

with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase():
_ = load_from_disk(dataset_path)


class TestLoadDatasetOnlySplits:
lhoestq (Member) commented:

We should also check this scenario:

  1. the user loads a dataset with the "train" split only
  2. the user then reloads this dataset, asking for the "test" split only

The caching mechanism should notice that the "test" split is missing and download and prepare it. If I'm not wrong, currently this would fail because of these lines:

data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == GenerateMode.REUSE_DATASET_IF_EXISTS:
logger.warning("Reusing dataset %s (%s)", self.name, self._cache_dir)
# We need to update the info in case some splits were added in the meantime
# for example when calling load_dataset from multiple workers.
self.info = self._load_info()
self.download_post_processing_resources(dl_manager)
return

Instead of checking for the cache_dir, we should check that the actual arrow split file exists inside cache_dir.

albertvillanova (Member, Author) commented:

The current implementation of load_dataset assumes that if there is something in the cache, then it is all you can get, unless you pass force_download.

I assumed that if the user passes a parameter to download only one split, then they are aware that only that split is in the cache and only that split can be loaded. In order to force a subsequent download of another split, they should pass force_download.

lhoestq (Member) commented:

Not a big fan of that. In my opinion, it should detect that the requested split is missing and generate it.
This will avoid some confusion for users.

lhoestq (Member) commented:

This is also a breaking change we can't afford, IMO.
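
Editor's note: a sketch of the split-aware cache check lhoestq proposes; the helper name is hypothetical, and the "{dataset_name}-{split}.arrow" naming is an assumption about the cache layout, not code from this PR:

import os

def requested_splits_are_cached(cache_dir, dataset_name, requested_splits):
    # Reuse the cache only if every requested split already has its arrow file.
    return all(
        os.path.exists(os.path.join(cache_dir, f"{dataset_name}-{split}.arrow"))
        for split in requested_splits
    )

download_and_prepare could then regenerate only the missing splits instead of returning early.
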

def test_load_dataset_local_only_splits_processed_files(self, dataset_loading_script_dir, data_dir, tmp_path):
download_config = DownloadConfig(splits=["test"])
cache_dir = str(tmp_path / "cache")
datasetdict = datasets.load_dataset(
dataset_loading_script_dir,
data_dir=data_dir,
cache_dir=cache_dir,
download_config=download_config,
)
assert isinstance(datasetdict, DatasetDict)
assert "train" not in datasetdict
assert "test" in datasetdict
dataset = datasetdict["test"]
assert dataset.split == "test"
assert dataset.shape == (10, 1)
# pattern = "*/0.0.0/74c0095031cf868e2486de6e08bb3ca4a9f9de3a81b10af67a42aed21393e640/*.arrow"
generated_arrow_files = sorted(Path(cache_dir, dataset.builder_name).glob("**/*.arrow"))
assert len(generated_arrow_files) == 1

def test_load_dataset_from_hub_only_splits_downloaded_files(self, tmp_path):
download_config = DownloadConfig(splits=["train"])
cache_dir = str(tmp_path / "cache")
datasetdict = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=cache_dir, download_config=download_config)
assert isinstance(datasetdict, DatasetDict)
assert "train" in datasetdict
assert "validation" not in datasetdict
dataset = datasetdict["train"]
assert dataset.split == "train"
assert dataset.shape == (2, 1)
downloaded_files = set(str(path.stem) for path in Path(cache_dir, "downloads").glob("**/*"))
assert len(downloaded_files) == 1
generated_arrow_files = sorted(Path(cache_dir, dataset.builder_name).glob("**/*.arrow"))
assert len(generated_arrow_files) == 1