@@ -38,7 +38,9 @@ def subsample_streaming_dataset(
38
38
39
39
# Make sure input_dir contains cache path and remote url
40
40
if _should_replace_path (input_dir .path ):
41
- cache_path = _try_create_cache_dir (input_dir = input_dir .path if input_dir .path else input_dir .url )
41
+ cache_path = _try_create_cache_dir (
42
+ input_dir = input_dir .path if input_dir .path else input_dir .url , storage_options = storage_options
43
+ )
42
44
if cache_path is not None :
43
45
input_dir .path = cache_path
44
46
@@ -96,7 +98,7 @@ def _should_replace_path(path: Optional[str]) -> bool:
96
98
return path .startswith ("/teamspace/datasets/" ) or path .startswith ("/teamspace/s3_connections/" )
97
99
98
100
99
- def _read_updated_at (input_dir : Optional [Dir ]) -> str :
101
+ def _read_updated_at (input_dir : Optional [Dir ], storage_options : Optional [ Dict ] = {} ) -> str :
100
102
"""Read last updated timestamp from index.json file."""
101
103
last_updation_timestamp = "0"
102
104
index_json_content = None
@@ -110,7 +112,7 @@ def _read_updated_at(input_dir: Optional[Dir]) -> str:
110
112
# download index.json file and read last_updation_timestamp
111
113
with tempfile .TemporaryDirectory () as tmp_directory :
112
114
temp_index_filepath = os .path .join (tmp_directory , _INDEX_FILENAME )
113
- downloader = get_downloader_cls (input_dir .url , tmp_directory , [])
115
+ downloader = get_downloader_cls (input_dir .url , tmp_directory , [], storage_options )
114
116
downloader .download_file (os .path .join (input_dir .url , _INDEX_FILENAME ), temp_index_filepath )
115
117
116
118
index_json_content = load_index_file (tmp_directory )
@@ -135,9 +137,9 @@ def _clear_cache_dir_if_updated(input_dir_hash_filepath: str, updated_at_hash: s
135
137
shutil .rmtree (input_dir_hash_filepath )
136
138
137
139
138
- def _try_create_cache_dir (input_dir : Optional [str ]) -> Optional [str ]:
140
+ def _try_create_cache_dir (input_dir : Optional [str ], storage_options : Optional [ Dict ] = {} ) -> Optional [str ]:
139
141
resolved_input_dir = _resolve_dir (input_dir )
140
- updated_at = _read_updated_at (resolved_input_dir )
142
+ updated_at = _read_updated_at (resolved_input_dir , storage_options )
141
143
142
144
if updated_at == "0" and input_dir is not None :
143
145
updated_at = hashlib .md5 (input_dir .encode ()).hexdigest () # noqa: S324
0 commit comments