Commit 05435fc

feat: Adds dataset.add_items_from_dir (#430)
* feat: Adds update_dataset_from_dir
* _create_or_update_dataset_from_dir always creates the dataset
* update test
* Refactor using dataset.add_items_from_dir
* Updates CHANGELOG and bumps version
* Check for item count bigger than zero
1 parent d69a92e commit 05435fc

File tree

8 files changed: +128 −33 lines changed

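In short, the commit adds `Dataset.add_items_from_dir` and refactors `NucleusClient.create_dataset_from_dir` on top of it. A minimal usage sketch, assuming a configured client; the API key placeholder and the `./images` path are hypothetical:

import nucleus

client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")  # hypothetical key

# Create a dataset from every png/jpg/jpeg found by recursively crawling ./images.
dataset = client.create_dataset_from_dir("./images")

# Later, pick up files added to the directory since the first crawl;
# per the new method's docstring, existing items are skipped unless update_items=True.
dataset.add_items_from_dir(dirname="./images")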

CHANGELOG.md

Lines changed: 9 additions & 0 deletions

@@ -5,6 +5,15 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.17.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.0) - 2024-02-06
+
+### Added
+- Added `dataset.add_items_from_dir`
+- Added pytest-xdist for test parallelization
+
+### Fixes
+- Fix test `test_models.test_remove_invalid_tag_from_model`
+
 
 ## [0.16.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.18) - 2024-02-06
 

nucleus/__init__.py

Lines changed: 28 additions & 32 deletions

@@ -1252,6 +1252,25 @@ def _set_api_key(self, api_key):
 
         return api_key
 
+    @staticmethod
+    def valid_dirname(dirname) -> str:
+        """
+        Validate that a directory exists.
+
+        Args:
+            dirname: Path of directory
+
+        Returns:
+            Existing directory path
+        """
+        # ensures path ends with a slash
+        _dirname = os.path.join(os.path.expanduser(dirname), "")
+        if not os.path.exists(_dirname):
+            raise ValueError(
+                f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
+            )
+        return _dirname
+
     def create_dataset_from_dir(
         self,
         dirname: str,
@@ -1260,7 +1279,7 @@ def create_dataset_from_dir(
         privacy_mode_proxy: str = "",
         allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
         skip_size_warning: bool = False,
-    ) -> Union[Dataset, None]:
+    ) -> Dataset:
         """
         Create a dataset by recursively crawling through a directory.
         A DatasetItem will be created for each unique image found.
@@ -1274,39 +1293,16 @@ def create_dataset_from_dir(
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
         """
-
-        if use_privacy_mode:
-            assert (
-                privacy_mode_proxy
-            ), "When using privacy mode, must specify a proxy to serve the files"
-
-        # ensures path ends with a slash
-        _dirname = os.path.join(os.path.expanduser(dirname), "")
-        if not os.path.exists(_dirname):
-            raise ValueError(
-                f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
-            )
-
-        folder_name = os.path.basename(_dirname.rstrip("/"))
+        existing_dirname = self.valid_dirname(dirname)
+        folder_name = os.path.basename(existing_dirname.rstrip("/"))
         dataset_name = dataset_name or folder_name
-        items = create_items_from_folder_crawl(
-            _dirname,
-            allowed_file_types,
-            use_privacy_mode,
-            privacy_mode_proxy,
-        )
-
-        if len(items) == 0:
-            print(f"Did not find any items in {dirname}")
-            return None
-
-        if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
-            raise Exception(
-                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
-            )
-
         dataset = self.create_dataset(
             name=dataset_name, use_privacy_mode=use_privacy_mode
         )
-        dataset.append(items, asynchronous=False)
+        dataset.add_items_from_dir(
+            existing_dirname=existing_dirname,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+        )
        return dataset
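One consequence of this refactor worth noting: the dataset is now created before the directory crawl, which is why the signature changed from `Union[Dataset, None]` to `Dataset`; an empty crawl no longer yields `None`. A sketch of the new behavior, with a hypothetical `empty_dir` path:

# After this commit, create_dataset_from_dir always returns a Dataset,
# even when the crawl finds no matching images.
dataset = client.create_dataset_from_dir("empty_dir")
assert dataset is not None
print(len(dataset.items))  # 0 if nothing matched; before, the call returned None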

nucleus/dataset.py

Lines changed: 54 additions & 0 deletions

@@ -26,6 +26,7 @@
 from nucleus.url_utils import sanitize_string_args
 from nucleus.utils import (
     convert_export_payload,
+    create_items_from_folder_crawl,
     format_dataset_item_response,
     format_prediction_response,
     format_scale_task_info_response,
@@ -50,6 +51,7 @@
     EXPORT_FOR_TRAINING_KEY,
     EXPORTED_ROWS,
     FRAME_RATE_KEY,
+    GLOB_SIZE_THRESHOLD_CHECK,
     ITEM_KEY,
     ITEMS_KEY,
     JOB_REQ_LIMIT,
@@ -2241,3 +2243,55 @@ def jobs(
         if stats_only:
             return jobs_status_overview(job_objects)
         return job_objects
+
+    def add_items_from_dir(
+        self,
+        dirname: Optional[str] = None,
+        existing_dirname: Optional[str] = None,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ):
+        """
+        Update the dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        Existing items are skipped or updated, depending on the update_items param.
+
+        Args:
+            dirname: Where to look for image files, recursively
+            existing_dirname: Already-validated dirname
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode; ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items in the existing dataset
+        """
+        # fetch dataset use_privacy_mode for existence check
+        if self.use_privacy_mode:
+            assert (
+                privacy_mode_proxy
+            ), "When using privacy mode, must specify a proxy to serve the files"
+        if not existing_dirname:
+            # ensures path ends with a slash
+            existing_dirname = self._client.valid_dirname(dirname)
+        items = create_items_from_folder_crawl(
+            existing_dirname,
+            allowed_file_types,
+            self.use_privacy_mode,
+            privacy_mode_proxy,
+        )
+
+        if len(items) > 0:
+            if (
+                len(items) > GLOB_SIZE_THRESHOLD_CHECK
+                and not skip_size_warning
+            ):
+                raise Exception(
+                    f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended,"
+                    f" set skip_size_warning=True when calling this function."
+                )
+            self.append(items, asynchronous=False, update=update_items)
+        else:
+            print(f"Did not find any items in {dirname}.")

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.16.18"
+version = "0.17.0"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 1 addition & 0 deletions

@@ -459,6 +459,7 @@ def reference_id_from_url(url):
 
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
+TEST_LOCAL_TESTDIR = os.path.join(this_dir, "testdata/testdir")
 
 
 NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)

tests/test_dataset.py

Lines changed: 35 additions & 0 deletions

@@ -1,5 +1,7 @@
 import copy
+import glob
 import math
+import os
 
 import pytest
 
@@ -38,6 +40,7 @@
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_LIDAR_SCENES,
+    TEST_LOCAL_TESTDIR,
     TEST_MULTICATEGORY_ANNOTATIONS,
     TEST_POLYGON_ANNOTATIONS,
     TEST_SEGMENTATION_ANNOTATIONS,
@@ -611,3 +614,35 @@ def test_query(CLIENT):
     with pytest.raises(NucleusAPIError):
         for qi in dataset.query_items("annotations.count bad syntax"):
             print(qi)  # unreachable, just need to yield an item from generator
+
+
+@pytest.mark.integration
+def test_create_update_dataset_from_dir(CLIENT):
+    reference_ids = set()
+    for file_type in ["png", "jpeg"]:
+        pathname = os.path.join(TEST_LOCAL_TESTDIR, f"**/*.{file_type}")
+        reference_ids.update(
+            path.replace(TEST_LOCAL_TESTDIR + "/", "")
+            for path in glob.glob(pathname=pathname, recursive=True)
+        )
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["exe"])
+    )
+    assert dataset is not None
+    CLIENT.delete_dataset(dataset.id)
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["png"])
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 1
+    assert dataset_items[0].reference_id in reference_ids
+    dataset.add_items_from_dir(
+        dirname=TEST_LOCAL_TESTDIR,
+        allowed_file_types=tuple(["png", "jpeg"]),
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 2
+    for dataset_item in dataset_items:
+        assert dataset_item.reference_id in reference_ids
+        reference_ids.remove(dataset_item.reference_id)
+    CLIENT.delete_dataset(dataset.id)
Binary file added (2.75 KB)

tests/testdata/testdir/airplane.jpeg

Binary file added (45.5 KB)
