Commit db6b48a

Refactor using dataset.add_items_from_dir
1 parent 29f3ddf

3 files changed: +65 -102 lines

nucleus/__init__.py

Lines changed: 14 additions & 98 deletions
```diff
@@ -1239,76 +1239,24 @@ def _set_api_key(self, api_key):
 
         return api_key
 
-    def _create_or_update_dataset_from_dir(
-        self,
-        dirname: str,
-        dataset_id: Optional[str] = None,
-        dataset_name: Optional[str] = None,
-        use_privacy_mode: bool = False,
-        privacy_mode_proxy: str = "",
-        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
-        skip_size_warning: bool = False,
-        update_items: bool = False,
-    ) -> Dataset:
+    @staticmethod
+    def valid_dirname(dirname) -> str:
         """
-        Create or update dataset by recursively crawling through a directory.
-        A DatasetItem will be created for each unique image found.
-        In case of update the existing items are skipped or updated depending on update_items param
-
+        Validate directory exists
         Args:
-            dirname: Where to look for image files, recursively
-            dataset_id: Dataset Id for update
-            dataset_name: If none is given, the parent folder name is used
-            use_privacy_mode: Whether the dataset should be treated as privacy (ignored if dataset being updated)
-            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
-                The proxy should work based on the relative path of the images in the directory.
-            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
-            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
-            update_items: Whether to update items in existing dataset
+            dirname: Path of directory
 
         Returns:
-            :class: `Dataset`: Created dataset or updated one
+            Existing directory path
 
         """
-        if dataset_id:
-            dataset = self.get_dataset(dataset_id)
-            # fetch dataset use_privacy_mode for existence check
-            use_privacy_mode = dataset.use_privacy_mode
-        else:
-            dataset = None
-        if use_privacy_mode:
-            assert (
-                privacy_mode_proxy
-            ), "When using privacy mode, must specify a proxy to serve the files"
-
         # ensures path ends with a slash
         _dirname = os.path.join(os.path.expanduser(dirname), "")
         if not os.path.exists(_dirname):
             raise ValueError(
                 f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
             )
-        items = create_items_from_folder_crawl(
-            _dirname,
-            allowed_file_types,
-            use_privacy_mode,
-            privacy_mode_proxy,
-        )
-
-        if len(items) == 0:
-            print(f"Did not find any items in {dirname}. Creating empty dataset")
-        elif len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
-            raise Exception(
-                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
-            )
-
-        if dataset is None:
-            folder_name = os.path.basename(_dirname.rstrip("/"))
-            dataset_name = dataset_name or folder_name
-            dataset = self.create_dataset(
-                name=dataset_name, use_privacy_mode=use_privacy_mode
-            )
-        dataset.append(items, asynchronous=False, update=update_items)
-        return dataset
+        return _dirname
 
     def create_dataset_from_dir(
         self,
@@ -1332,48 +1280,16 @@ def create_dataset_from_dir(
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
         """
-        return self._create_or_update_dataset_from_dir(
-            dirname,
-            dataset_name=dataset_name,
-            use_privacy_mode=use_privacy_mode,
-            privacy_mode_proxy=privacy_mode_proxy,
-            allowed_file_types=allowed_file_types,
-            skip_size_warning=skip_size_warning,
+        existing_dirname = self.valid_dirname(dirname)
+        folder_name = os.path.basename(existing_dirname.rstrip("/"))
+        dataset_name = dataset_name or folder_name
+        dataset = self.create_dataset(
+            name=dataset_name, use_privacy_mode=use_privacy_mode
         )
-
-    def update_dataset_from_dir(
-        self,
-        dirname: str,
-        dataset_id: str,
-        privacy_mode_proxy: str = "",
-        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
-        skip_size_warning: bool = False,
-        update_items: bool = False,
-    ) -> Dataset:
-        """
-        Update dataset by recursively crawling through a directory.
-        A DatasetItem will be created for each unique image found.
-        The existing items are skipped or updated depending on update_items param
-
-        Args:
-            dirname: Where to look for image files, recursively
-            dataset_id: ID of existing dataset to update
-            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
-                The proxy should work based on the relative path of the images in the directory.
-            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
-            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
-            update_items: Whether to update items in existing dataset
-
-        Returns:
-            :class:`Dataset`: Updated dataset
-        """
-        updated_dataset = self._create_or_update_dataset_from_dir(
-            dirname,
-            dataset_id=dataset_id,
+        dataset.add_items_from_dir(
+            existing_dirname=existing_dirname,
             privacy_mode_proxy=privacy_mode_proxy,
             allowed_file_types=allowed_file_types,
             skip_size_warning=skip_size_warning,
-            update_items=update_items,
        )
-        assert updated_dataset is not None
-        return updated_dataset
+        return dataset
```
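The net effect in `nucleus/__init__.py`: path validation is isolated in the static `valid_dirname` helper, and `create_dataset_from_dir` now only names and creates the dataset before delegating crawling and appending to `Dataset.add_items_from_dir`. A minimal usage sketch of the create path; the API key and directory below are placeholders, not values from this commit:

```python
import nucleus

# Placeholder API key and directory, for illustration only.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")

# valid_dirname expands "~", ensures a trailing slash, and raises
# ValueError if the directory does not exist.
checked_dir = client.valid_dirname("~/images/traffic-lights")

# Creates the dataset (named after the folder unless dataset_name is given),
# then hands item crawling/appending to Dataset.add_items_from_dir.
dataset = client.create_dataset_from_dir(
    "~/images/traffic-lights",
    allowed_file_types=("png", "jpg"),
)
```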

nucleus/dataset.py

Lines changed: 49 additions & 1 deletion
```diff
@@ -31,6 +31,7 @@
     format_scale_task_info_response,
     paginate_generator,
     serialize_and_write_to_presigned_url,
+    create_items_from_folder_crawl,
 )
 
 from .annotation import Annotation, check_all_mask_paths_remote
@@ -69,6 +70,7 @@
     TRAINED_SLICE_ID_KEY,
     UPDATE_KEY,
     VIDEO_URL_KEY,
+    GLOB_SIZE_THRESHOLD_CHECK,
 )
 from .data_transfer_object.dataset_info import DatasetInfo
 from .data_transfer_object.dataset_size import DatasetSize
@@ -726,7 +728,6 @@ def append(
             local_files_per_upload_request=local_files_per_upload_request,
         )
 
-
     @deprecated("Prefer using Dataset.append instead.")
     def append_scenes(
         self,
@@ -2242,3 +2243,50 @@ def jobs(
         if stats_only:
             return jobs_status_overview(job_objects)
         return job_objects
+
+    def add_items_from_dir(
+        self,
+        dirname: Optional[str] = None,
+        existing_dirname: Optional[str] = None,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ):
+        """
+        Update dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        The existing items are skipped or updated depending on update_items param
+
+        Args:
+            dirname: Where to look for image files, recursively
+            existing_dirname: Already validated dirname
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items in existing dataset
+        """
+        # fetch dataset use_privacy_mode for existence check
+        if self.use_privacy_mode:
+            assert (
+                privacy_mode_proxy
+            ), "When using privacy mode, must specify a proxy to serve the files"
+        if not existing_dirname:
+            # ensures path ends with a slash
+            existing_dirname = self._client.valid_dirname(dirname)
+        items = create_items_from_folder_crawl(
+            existing_dirname,
+            allowed_file_types,
+            self.use_privacy_mode,
+            privacy_mode_proxy,
+        )
+
+        if len(items) == 0:
+            print(f"Did not find any items in {dirname}.")
+        elif len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
+            raise Exception(
+                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
+            )
+
+        self.append(items, asynchronous=False, update=update_items)
```
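With `update_dataset_from_dir` removed from the client, updating an existing dataset now goes through the `Dataset` object directly. A sketch of the new update path; the dataset ID and directory are placeholders:

```python
import nucleus

# Placeholder values, for illustration only.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
dataset = client.get_dataset("existing_dataset_id")

# Re-crawls the directory; items whose reference IDs already exist are
# skipped unless update_items=True.
dataset.add_items_from_dir(
    dirname="~/images/traffic-lights",
    allowed_file_types=("png", "jpeg"),
    update_items=True,
)
```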

tests/test_dataset.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -636,9 +636,8 @@ def test_create_update_dataset_from_dir(CLIENT):
     dataset_items = dataset.items
     assert len(dataset_items) == 1
     assert dataset_items[0].reference_id in reference_ids
-    dataset = CLIENT.update_dataset_from_dir(
-        TEST_LOCAL_TESTDIR,
-        dataset_id=dataset.id,
+    dataset.add_items_from_dir(
+        dirname=TEST_LOCAL_TESTDIR,
         allowed_file_types=tuple(["png", "jpeg"]),
     )
     dataset_items = dataset.items
```
