Commit 8916a0f

feat: Adds update_dataset_from_dir

1 parent b8a4305 commit 8916a0f

5 files changed: 133 additions & 13 deletions

nucleus/__init__.py

Lines changed: 97 additions & 13 deletions
@@ -1239,29 +1239,43 @@ def _set_api_key(self, api_key):
 
         return api_key
 
-    def create_dataset_from_dir(
+    def _create_or_update_dataset_from_dir(
         self,
         dirname: str,
+        dataset_id: Optional[str] = None,
         dataset_name: Optional[str] = None,
         use_privacy_mode: bool = False,
         privacy_mode_proxy: str = "",
         allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
         skip_size_warning: bool = False,
+        update_items: bool = False,
     ) -> Union[Dataset, None]:
         """
-        Create a dataset by recursively crawling through a directory.
+        Create or update a dataset by recursively crawling through a directory.
         A DatasetItem will be created for each unique image found.
+        When updating, existing items are skipped or updated depending on the update_items param.
 
-        Parameters:
+        Args:
             dirname: Where to look for image files, recursively
+            dataset_id: Dataset ID for update
             dataset_name: If none is given, the parent folder name is used
-            use_privacy_mode: Whether the dataset should be treated as privacy
+            use_privacy_mode: Whether the dataset should be treated as privacy (ignored if the dataset is being updated)
             privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
                 The proxy should work based on the relative path of the images in the directory.
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
-        """
+            update_items: Whether to update items in the existing dataset
+
+        Returns:
+            :class:`Union[Dataset, None]`: Updated dataset, or None if there was no dataset to create
 
+        """
+        if dataset_id:
+            existing_dataset = self.get_dataset(dataset_id)
+            # fetch dataset use_privacy_mode for existence check
+            use_privacy_mode = existing_dataset.use_privacy_mode
+        else:
+            existing_dataset = None
         if use_privacy_mode:
             assert (
                 privacy_mode_proxy
@@ -1273,9 +1287,6 @@ def create_dataset_from_dir(
             raise ValueError(
                 f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
             )
-
-        folder_name = os.path.basename(_dirname.rstrip("/"))
-        dataset_name = dataset_name or folder_name
         items = create_items_from_folder_crawl(
             _dirname,
             allowed_file_types,
@@ -1285,15 +1296,88 @@
 
         if len(items) == 0:
             print(f"Did not find any items in {dirname}")
-            return None
+            return existing_dataset
 
         if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
             raise Exception(
                 f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
             )
 
-        dataset = self.create_dataset(
-            name=dataset_name, use_privacy_mode=use_privacy_mode
-        )
-        dataset.append(items, asynchronous=False)
+        if existing_dataset:
+            dataset = existing_dataset
+        else:
+            folder_name = os.path.basename(_dirname.rstrip("/"))
+            dataset_name = dataset_name or folder_name
+            dataset = self.create_dataset(
+                name=dataset_name, use_privacy_mode=use_privacy_mode
+            )
+        dataset.append(items, asynchronous=False, update=update_items)
         return dataset
+
+    def create_dataset_from_dir(
+        self,
+        dirname: str,
+        dataset_name: Optional[str] = None,
+        use_privacy_mode: bool = False,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+    ) -> Union[Dataset, None]:
+        """
+        Create a dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+
+        Parameters:
+            dirname: Where to look for image files, recursively
+            dataset_name: If none is given, the parent folder name is used
+            use_privacy_mode: Whether the dataset should be treated as privacy
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+        """
+        return self._create_or_update_dataset_from_dir(
+            dirname,
+            dataset_name=dataset_name,
+            use_privacy_mode=use_privacy_mode,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+        )
+
+    def update_dataset_from_dir(
+        self,
+        dirname: str,
+        dataset_id: str,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ) -> Dataset:
+        """
+        Update a dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        Existing items are skipped or updated depending on the update_items param.
+
+        Args:
+            dirname: Where to look for image files, recursively
+            dataset_id: ID of existing dataset to update
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items in the existing dataset
+
+        Returns:
+            :class:`Dataset`: Updated dataset
+        """
+        updated_dataset = self._create_or_update_dataset_from_dir(
+            dirname,
+            dataset_id=dataset_id,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+            update_items=update_items,
+        )
+        assert updated_dataset is not None
+        return updated_dataset
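
For orientation, here is a minimal usage sketch of the two public entry points added by this commit. The API key and the "data/images" directory are hypothetical placeholders, not part of the commit:

import nucleus

client = nucleus.NucleusClient("YOUR_API_KEY")  # hypothetical key

# First pass: create a dataset from every png/jpg/jpeg found under the directory.
dataset = client.create_dataset_from_dir("data/images")

if dataset is not None:
    # Later, after new images land in the same directory, re-crawl it and
    # append them. With update_items=False (the default), items whose
    # reference_id already exists are skipped; with update_items=True they
    # are updated in place.
    dataset = client.update_dataset_from_dir(
        "data/images",
        dataset_id=dataset.id,
        update_items=False,
    )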

tests/helpers.py

Lines changed: 1 addition & 0 deletions
@@ -459,6 +459,7 @@ def reference_id_from_url(url):
 
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
+TEST_LOCAL_TESTDIR = os.path.join(this_dir, "testdata/testdir")
 
 
 NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)

tests/test_dataset.py

Lines changed: 35 additions & 0 deletions
@@ -1,5 +1,7 @@
 import copy
+import glob
 import math
+import os
 
 import pytest
 
@@ -44,6 +46,7 @@
     TEST_VIDEO_SCENES,
     assert_partial_equality,
     reference_id_from_url,
+    TEST_LOCAL_TESTDIR,
 )
 
 
@@ -611,3 +614,35 @@ def test_query(CLIENT):
     with pytest.raises(NucleusAPIError):
         for qi in dataset.query_items("annotations.count bad syntax"):
             print(qi)  # unreachable, just need to yield an item from generator
+
+
+@pytest.mark.integration
+def test_create_update_dataset_from_dir(CLIENT):
+    reference_ids = set()
+    for file_type in ["png", "jpeg"]:
+        pathname = os.path.join(TEST_LOCAL_TESTDIR, f"**/*.{file_type}")
+        reference_ids.update(
+            path.replace(TEST_LOCAL_TESTDIR + "/", "")
+            for path in glob.glob(pathname=pathname, recursive=True)
+        )
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["exe"])
+    )
+    assert dataset is None
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["png"])
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 1
+    assert dataset_items[0].reference_id in reference_ids
+    dataset = CLIENT.update_dataset_from_dir(
+        TEST_LOCAL_TESTDIR,
+        dataset_id=dataset.id,
+        allowed_file_types=tuple(["png", "jpeg"]),
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 2
+    for dataset_item in dataset_items:
+        assert dataset_item.reference_id in reference_ids
+        reference_ids.remove(dataset_item.reference_id)
+    CLIENT.delete_dataset(dataset.id)
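
The assertions above encode the convention that a crawled image's reference_id is its path relative to the crawl root (which is also what privacy_mode_proxy is expected to serve against). A standalone sketch of that derivation, with a hypothetical directory layout:

import glob
import os

testdir = "tests/testdata/testdir"  # crawl root, as in the test

# Build the expected reference_ids the same way the test does: glob
# recursively, then strip the crawl-root prefix from each match.
expected_reference_ids = {
    path.replace(testdir + "/", "")
    for path in glob.glob(os.path.join(testdir, "**/*.jpeg"), recursive=True)
}
print(expected_reference_ids)  # e.g. {"airplane.jpeg"}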
tests/testdata/testdir/… (binary image, 2.75 KB; filename not captured in this view)

tests/testdata/testdir/airplane.jpeg (binary image, 45.5 KB)
