Commit 05435fc

feat: Adds dataset.add_items_from_dir (#430)
* feat: Adds update_dataset_from_dir
* _create_or_update_dataset_from_dir always creates the dataset
* update test
* Refactor using dataset.add_items_from_dir
* Updates CHANGELOG and bumps version
* Check for item count bigger than zero
1 parent d69a92e commit 05435fc

File tree

8 files changed: +128 −33 lines changed

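In short, the commit adds `Dataset.add_items_from_dir` and refactors `NucleusClient.create_dataset_from_dir` on top of it. A minimal usage sketch, assuming a configured client; the API key placeholder and the `./images` path are hypothetical:

import nucleus

client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")  # hypothetical key

# Create a dataset from every png/jpg/jpeg found by recursively crawling ./images.
dataset = client.create_dataset_from_dir("./images")

# Later, pick up files added to the directory since the first crawl;
# per the new method's docstring, existing items are skipped unless update_items=True.
dataset.add_items_from_dir(dirname="./images")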

CHANGELOG.md

Lines changed: 9 additions & 0 deletions

@@ -5,6 +5,15 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.17.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.0) - 2024-02-06
+
+### Added
+- Added `dataset.add_items_from_dir`
+- Added pytest-xdist for test parallelization
+
+### Fixes
+- Fix test `test_models.test_remove_invalid_tag_from_model`
+
 
 ## [0.16.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.18) - 2024-02-06
 

nucleus/__init__.py

Lines changed: 28 additions & 32 deletions

@@ -1252,6 +1252,25 @@ def _set_api_key(self, api_key):
 
         return api_key
 
+    @staticmethod
+    def valid_dirname(dirname) -> str:
+        """
+        Validate that a directory exists.
+
+        Args:
+            dirname: Path of directory
+
+        Returns:
+            Existing directory path
+        """
+        # ensures path ends with a slash
+        _dirname = os.path.join(os.path.expanduser(dirname), "")
+        if not os.path.exists(_dirname):
+            raise ValueError(
+                f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
+            )
+        return _dirname
+
     def create_dataset_from_dir(
         self,
         dirname: str,
@@ -1260,7 +1279,7 @@ def create_dataset_from_dir(
         privacy_mode_proxy: str = "",
         allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
         skip_size_warning: bool = False,
-    ) -> Union[Dataset, None]:
+    ) -> Dataset:
         """
         Create a dataset by recursively crawling through a directory.
         A DatasetItem will be created for each unique image found.
@@ -1274,39 +1293,16 @@ def create_dataset_from_dir(
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
         """
-
-        if use_privacy_mode:
-            assert (
-                privacy_mode_proxy
-            ), "When using privacy mode, must specify a proxy to serve the files"
-
-        # ensures path ends with a slash
-        _dirname = os.path.join(os.path.expanduser(dirname), "")
-        if not os.path.exists(_dirname):
-            raise ValueError(
-                f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
-            )
-
-        folder_name = os.path.basename(_dirname.rstrip("/"))
+        existing_dirname = self.valid_dirname(dirname)
+        folder_name = os.path.basename(existing_dirname.rstrip("/"))
         dataset_name = dataset_name or folder_name
-        items = create_items_from_folder_crawl(
-            _dirname,
-            allowed_file_types,
-            use_privacy_mode,
-            privacy_mode_proxy,
-        )
-
-        if len(items) == 0:
-            print(f"Did not find any items in {dirname}")
-            return None
-
-        if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
-            raise Exception(
-                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
-            )
-
         dataset = self.create_dataset(
             name=dataset_name, use_privacy_mode=use_privacy_mode
         )
-        dataset.append(items, asynchronous=False)
+        dataset.add_items_from_dir(
+            existing_dirname=existing_dirname,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+        )
        return dataset
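One consequence of this refactor worth noting: the dataset is now created before the directory crawl, which is why the signature changed from `Union[Dataset, None]` to `Dataset`; an empty crawl no longer yields `None`. A sketch of the new behavior, with a hypothetical `empty_dir` path:

# After this commit, create_dataset_from_dir always returns a Dataset,
# even when the crawl finds no matching images.
dataset = client.create_dataset_from_dir("empty_dir")
assert dataset is not None
print(len(dataset.items))  # 0 if nothing matched; before, the call returned None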

nucleus/dataset.py

Lines changed: 54 additions & 0 deletions

@@ -26,6 +26,7 @@
 from nucleus.url_utils import sanitize_string_args
 from nucleus.utils import (
     convert_export_payload,
+    create_items_from_folder_crawl,
     format_dataset_item_response,
     format_prediction_response,
     format_scale_task_info_response,
@@ -50,6 +51,7 @@
     EXPORT_FOR_TRAINING_KEY,
     EXPORTED_ROWS,
     FRAME_RATE_KEY,
+    GLOB_SIZE_THRESHOLD_CHECK,
     ITEM_KEY,
     ITEMS_KEY,
     JOB_REQ_LIMIT,
@@ -2241,3 +2243,55 @@ def jobs(
         if stats_only:
             return jobs_status_overview(job_objects)
         return job_objects
+
+    def add_items_from_dir(
+        self,
+        dirname: Optional[str] = None,
+        existing_dirname: Optional[str] = None,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ):
+        """
+        Update the dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        Existing items are skipped or updated, depending on the update_items param.
+
+        Args:
+            dirname: Where to look for image files, recursively
+            existing_dirname: Already-validated dirname
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode; ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items in the existing dataset
+        """
+        # fetch dataset use_privacy_mode for existence check
+        if self.use_privacy_mode:
+            assert (
+                privacy_mode_proxy
+            ), "When using privacy mode, must specify a proxy to serve the files"
+        if not existing_dirname:
+            # ensures path ends with a slash
+            existing_dirname = self._client.valid_dirname(dirname)
+        items = create_items_from_folder_crawl(
+            existing_dirname,
+            allowed_file_types,
+            self.use_privacy_mode,
+            privacy_mode_proxy,
+        )
+
+        if len(items) > 0:
+            if (
+                len(items) > GLOB_SIZE_THRESHOLD_CHECK
+                and not skip_size_warning
+            ):
+                raise Exception(
+                    f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended,"
+                    f" set skip_size_warning=True when calling this function."
+                )
+            self.append(items, asynchronous=False, update=update_items)
+        else:
+            print(f"Did not find any items in {dirname}.")

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.16.18"
+version = "0.17.0"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 1 addition & 0 deletions

@@ -459,6 +459,7 @@ def reference_id_from_url(url):
 
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
+TEST_LOCAL_TESTDIR = os.path.join(this_dir, "testdata/testdir")
 
 
 NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)

tests/test_dataset.py

Lines changed: 35 additions & 0 deletions

@@ -1,5 +1,7 @@
 import copy
+import glob
 import math
+import os
 
 import pytest
 
@@ -38,6 +40,7 @@
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_LIDAR_SCENES,
+    TEST_LOCAL_TESTDIR,
     TEST_MULTICATEGORY_ANNOTATIONS,
     TEST_POLYGON_ANNOTATIONS,
     TEST_SEGMENTATION_ANNOTATIONS,
@@ -611,3 +614,35 @@ def test_query(CLIENT):
     with pytest.raises(NucleusAPIError):
         for qi in dataset.query_items("annotations.count bad syntax"):
             print(qi)  # unreachable, just need to yield an item from generator
+
+
+@pytest.mark.integration
+def test_create_update_dataset_from_dir(CLIENT):
+    reference_ids = set()
+    for file_type in ["png", "jpeg"]:
+        pathname = os.path.join(TEST_LOCAL_TESTDIR, f"**/*.{file_type}")
+        reference_ids.update(
+            path.replace(TEST_LOCAL_TESTDIR + "/", "")
+            for path in glob.glob(pathname=pathname, recursive=True)
+        )
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["exe"])
+    )
+    assert dataset is not None
+    CLIENT.delete_dataset(dataset.id)
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["png"])
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 1
+    assert dataset_items[0].reference_id in reference_ids
+    dataset.add_items_from_dir(
+        dirname=TEST_LOCAL_TESTDIR,
+        allowed_file_types=tuple(["png", "jpeg"]),
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 2
+    for dataset_item in dataset_items:
+        assert dataset_item.reference_id in reference_ids
+        reference_ids.remove(dataset_item.reference_id)
+    CLIENT.delete_dataset(dataset.id)
Binary file added (2.75 KB)

tests/testdata/testdir/airplane.jpeg

Binary file added (45.5 KB)
