feat: Adds update_dataset_from_dir #430

Merged
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.17.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.0) - 2024-02-06
+
+### Added
+- Added `dataset.add_items_from_dir`
+- Added pytest-xdist for test parallelization
+
+### Fixes
+- Fix test `test_models.test_remove_invalid_tag_from_model`
+
+
 ## [0.16.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.18) - 2024-02-06
 
60 changes: 28 additions & 32 deletions nucleus/__init__.py
@@ -1252,6 +1252,25 @@ def _set_api_key(self, api_key):
 
         return api_key
 
+    @staticmethod
+    def valid_dirname(dirname) -> str:
+        """
+        Validate that the directory exists.
+
+        Args:
+            dirname: Path of directory
+
+        Returns:
+            Existing directory path
+        """
+        # ensure the path ends with a slash
+        _dirname = os.path.join(os.path.expanduser(dirname), "")
+        if not os.path.exists(_dirname):
+            raise ValueError(
+                f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
+            )
+        return _dirname
+
     def create_dataset_from_dir(
         self,
         dirname: str,
@@ -1260,7 +1279,7 @@ def create_dataset_from_dir(
         privacy_mode_proxy: str = "",
         allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
         skip_size_warning: bool = False,
-    ) -> Union[Dataset, None]:
+    ) -> Dataset:
         """
         Create a dataset by recursively crawling through a directory.
         A DatasetItem will be created for each unique image found.
@@ -1274,39 +1293,16 @@ def create_dataset_from_dir(
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
         """
-
-        if use_privacy_mode:
-            assert (
-                privacy_mode_proxy
-            ), "When using privacy mode, must specify a proxy to serve the files"
-
-        # ensures path ends with a slash
-        _dirname = os.path.join(os.path.expanduser(dirname), "")
-        if not os.path.exists(_dirname):
-            raise ValueError(
-                f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
-            )
-
-        folder_name = os.path.basename(_dirname.rstrip("/"))
+        existing_dirname = self.valid_dirname(dirname)
+        folder_name = os.path.basename(existing_dirname.rstrip("/"))
         dataset_name = dataset_name or folder_name
-        items = create_items_from_folder_crawl(
-            _dirname,
-            allowed_file_types,
-            use_privacy_mode,
-            privacy_mode_proxy,
-        )
-
-        if len(items) == 0:
-            print(f"Did not find any items in {dirname}")
-            return None
-
-        if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
-            raise Exception(
-                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
-            )
-
         dataset = self.create_dataset(
             name=dataset_name, use_privacy_mode=use_privacy_mode
         )
-        dataset.append(items, asynchronous=False)
+        dataset.add_items_from_dir(
+            existing_dirname=existing_dirname,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+        )
         return dataset
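
For context on the refactor: `create_dataset_from_dir` still creates the dataset in one call, but the crawl, size check, and upload now live in `Dataset.add_items_from_dir` (next file). A minimal usage sketch under that reading; the API key and directory path are placeholders, not values from this PR:

```python
import nucleus

client = nucleus.NucleusClient("YOUR_API_KEY")  # placeholder key

# Create a dataset from every png/jpg/jpeg found recursively under the
# directory; the dataset name defaults to the folder name when omitted.
dataset = client.create_dataset_from_dir(
    dirname="~/images",          # placeholder path
    skip_size_warning=True,      # opt out of the >500-item safety check
)

# The new helper validates eagerly, so a typo fails fast with ValueError
# instead of creating an empty dataset.
normalized = client.valid_dirname("~/images")  # e.g. "/home/me/images/"
```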
54 changes: 54 additions & 0 deletions nucleus/dataset.py
@@ -26,6 +26,7 @@
 from nucleus.url_utils import sanitize_string_args
 from nucleus.utils import (
     convert_export_payload,
+    create_items_from_folder_crawl,
     format_dataset_item_response,
     format_prediction_response,
     format_scale_task_info_response,
@@ -50,6 +51,7 @@
     EXPORT_FOR_TRAINING_KEY,
     EXPORTED_ROWS,
     FRAME_RATE_KEY,
+    GLOB_SIZE_THRESHOLD_CHECK,
     ITEM_KEY,
     ITEMS_KEY,
     JOB_REQ_LIMIT,
@@ -2241,3 +2243,55 @@ def jobs(
         if stats_only:
             return jobs_status_overview(job_objects)
         return job_objects
+
+    def add_items_from_dir(
+        self,
+        dirname: Optional[str] = None,
+        existing_dirname: Optional[str] = None,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ):
+        """
+        Update a dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        Existing items are skipped or updated depending on the update_items param.
+
+        Args:
+            dirname: Where to look for image files, recursively
+            existing_dirname: An already-validated directory path
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode, ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, e.g.: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items already present in the dataset
+        """
+        # the dataset's use_privacy_mode is fetched for this check
+        if self.use_privacy_mode:
+            assert (
+                privacy_mode_proxy
+            ), "When using privacy mode, must specify a proxy to serve the files"
+        if not existing_dirname:
+            # validate the directory; the returned path ends with a slash
+            existing_dirname = self._client.valid_dirname(dirname)
+        items = create_items_from_folder_crawl(
+            existing_dirname,
+            allowed_file_types,
+            self.use_privacy_mode,
+            privacy_mode_proxy,
+        )
+
+        if len(items) > 0:
+            if (
+                len(items) > GLOB_SIZE_THRESHOLD_CHECK
+                and not skip_size_warning
+            ):
+                raise Exception(
+                    f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended,"
+                    f" set skip_size_warning=True when calling this function."
+                )
+            self.append(items, asynchronous=False, update=update_items)
+
+        else:
+            print(f"Did not find any items in {dirname}.")
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.16.18"
+version = "0.17.0"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
1 change: 1 addition & 0 deletions tests/helpers.py
@@ -459,6 +459,7 @@ def reference_id_from_url(url):
 
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
+TEST_LOCAL_TESTDIR = os.path.join(this_dir, "testdata/testdir")
 
 
 NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)
35 changes: 35 additions & 0 deletions tests/test_dataset.py
@@ -1,5 +1,7 @@
 import copy
+import glob
 import math
+import os
 
 import pytest
 
@@ -38,6 +40,7 @@
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_LIDAR_SCENES,
+    TEST_LOCAL_TESTDIR,
     TEST_MULTICATEGORY_ANNOTATIONS,
     TEST_POLYGON_ANNOTATIONS,
     TEST_SEGMENTATION_ANNOTATIONS,
@@ -611,3 +614,35 @@ def test_query(CLIENT):
     with pytest.raises(NucleusAPIError):
         for qi in dataset.query_items("annotations.count bad syntax"):
             print(qi)  # unreachable, just need to yield an item from generator
+
+
+@pytest.mark.integration
+def test_create_update_dataset_from_dir(CLIENT):
+    reference_ids = set()
+    for file_type in ["png", "jpeg"]:
+        pathname = os.path.join(TEST_LOCAL_TESTDIR, f"**/*.{file_type}")
+        reference_ids.update(
+            path.replace(TEST_LOCAL_TESTDIR + "/", "")
+            for path in glob.glob(pathname=pathname, recursive=True)
+        )
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["exe"])
+    )
+    assert dataset is not None
+    CLIENT.delete_dataset(dataset.id)
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["png"])
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 1
+    assert dataset_items[0].reference_id in reference_ids
+    dataset.add_items_from_dir(
+        dirname=TEST_LOCAL_TESTDIR,
+        allowed_file_types=tuple(["png", "jpeg"]),
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 2
+    for dataset_item in dataset_items:
+        assert dataset_item.reference_id in reference_ids
+        reference_ids.remove(dataset_item.reference_id)
+    CLIENT.delete_dataset(dataset.id)
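
The assertions above suggest that a crawled item's reference ID is its path relative to the crawled directory. A standalone sketch of that expectation, using only the standard library (the directory layout is hypothetical):

```python
import glob
import os

testdir = "tests/testdata/testdir"  # hypothetical checkout-relative path

# Mirror the test's derivation: reference IDs are paths relative to the
# crawled directory, so files in subdirectories keep their prefix.
expected_reference_ids = {
    path.replace(testdir + "/", "")
    for path in glob.glob(os.path.join(testdir, "**/*.*"), recursive=True)
}
print(expected_reference_ids)  # e.g. {"000000000285.png", "airplane.jpeg"}
```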
Binary file added tests/testdata/testdir/000000000285.png
Binary file added tests/testdata/testdir/airplane.jpeg