Clean up relationship between deployment_tar and deployment (#389)

bfineran · dbogunowicz · web-flow · commit cb3c1fed06aa · 2023-12-04T11:27:24.000-05:00
* [WIP] clean up relationship between deployment_tar and deployment

* note that deployment_directory_path can be removed

* potentially working solution

* chaotic but working commit

* tests pass

* fix typo

* address PR comments

---------

Co-authored-by: Damian <damian@neuralmagic.com>
Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
diff --git a/src/sparsezoo/model/model.py b/src/sparsezoo/model/model.py
@@ -111,12 +111,14 @@ def __init__(self, source: str, download_path: Optional[str] = None):
         self.sample_originals: Directory = self._directory_from_files(
             files,
             directory_class=Directory,
+            allow_multiple_outputs=True,
             display_name="sample-originals",
         )
         self.sample_inputs: NumpyDirectory = self._directory_from_files(
             files,
             directory_class=NumpyDirectory,
             display_name="sample-inputs",
+            allow_multiple_outputs=True,
         )
 
         self.model_card: File = self._file_from_files(files, display_name="model.md")
@@ -134,30 +136,25 @@ def __init__(self, source: str, download_path: Optional[str] = None):
             ] = self._sample_outputs_list_to_dict(self.sample_outputs)
 
         self.sample_labels: Directory = self._directory_from_files(
-            files, directory_class=Directory, display_name="sample-labels"
-        )
-
-        self.deployment: SelectDirectory = self._directory_from_files(
             files,
-            directory_class=SelectDirectory,
-            display_name="deployment",
-            stub_params=self.stub_params,
+            directory_class=Directory,
             allow_multiple_outputs=True,
+            display_name="sample-labels",
         )
 
-        if isinstance(self.deployment, list):
-            # if there are multiple deployment directories
-            # (this may happen due to the presence of both
-            # - deployment directory
-            # - deployment.tar.gz file
-            # we need to choose one (they are identical)
-            self.deployment = self.deployment[0]
-
         self.deployment_tar: SelectDirectory = self._directory_from_files(
             files,
             directory_class=SelectDirectory,
             display_name="deployment.tar.gz",
         )
+        self.deployment: SelectDirectory = self._directory_from_files(
+            files,
+            directory_class=SelectDirectory,
+            display_name="deployment",
+            stub_params=self.stub_params,
+            allow_multiple_outputs=True,
+            tar_directory=self.deployment_tar,
+        )
 
         self.onnx_folder: Directory = self._directory_from_files(
             files,
@@ -194,6 +191,30 @@ def __init__(self, source: str, download_path: Optional[str] = None):
         # compressed file size on disk in bytes
         self.compressed_size: Optional[int] = compressed_size
 
+        # if there are multiple deployment directories
+        # (this may happen due to the presence of both e.g.:
+        # - deployment directory
+        # - deployment.tar.gz file
+        # we need to choose one (they are identical at this point)
+        self.sample_originals = (
+            self.sample_originals[0]
+            if isinstance(self.sample_originals, list)
+            else self.sample_originals
+        )
+        self.sample_inputs = (
+            self.sample_inputs[0]
+            if isinstance(self.sample_inputs, list)
+            else self.sample_inputs
+        )
+        self.sample_labels = (
+            self.sample_labels[0]
+            if isinstance(self.sample_labels, list)
+            else self.sample_labels
+        )
+        self.deployment = (
+            self.deployment[0] if isinstance(self.deployment, list) else self.deployment
+        )
+
         # sorting name of `sample_inputs` and `sample_output` files,
         # so that they have same one-to-one correspondence when we jointly
         # iterate over them
@@ -209,7 +230,6 @@ def __init__(self, source: str, download_path: Optional[str] = None):
         self._files_dictionary = {
             "training": self.training,
             "deployment": self.deployment,
-            "deployment.tar.gz": self.deployment_tar,
             "onnx_folder": self.onnx_folder,
             "logs": self.logs,
             "sample_originals": self.sample_originals,
@@ -239,20 +259,6 @@ def __init__(self, source: str, download_path: Optional[str] = None):
 
         self.integration_validator = IntegrationValidator(model=self)
 
-    @property
-    def deployment_directory_path(self) -> str:
-        """
-        :return: file path of uncompressed deployemnt directory. Both (1) downloads
-            compressed deployemnent directory if not downloaded (2) uncompresses
-            deployment directory if compressed
-        """
-        # trigger initial download if not downloaded
-        self.deployment_tar.path
-        if self.deployment_tar.is_archive:
-            self.deployment_tar.unzip()
-
-        return self.deployment.path
-
     @property
     def stub_params(self) -> Dict[str, str]:
         """
@@ -324,12 +330,6 @@ def download(
         else:
             downloads = []
             for key, file in self._files_dictionary.items():
-                if key == "deployment":
-                    # skip the download of the deployment directory
-                    # since identical files will be downloaded
-                    # in the deployment_tar
-                    _LOGGER.debug(f"Intentionally skipping downloading the file {key}")
-                    continue
                 if file is not None:
                     # save all the files to a temporary directory
                     downloads.append(self._download(file, download_path))
@@ -636,8 +636,8 @@ def _directory_from_files(
         files: List[Dict[str, Any]],
         directory_class: Union[Directory, NumpyDirectory] = Directory,
         display_name: Optional[str] = None,
-        regex: Optional[bool] = False,
-        allow_multiple_outputs: Optional[bool] = False,
+        regex: bool = False,
+        allow_multiple_outputs: bool = False,
         **kwargs: object,
     ) -> Union[List[Union[Directory, Any, None]], List[Directory], None]:
 
@@ -746,10 +746,11 @@ def _sample_outputs_list_to_dict(
                 engine_name = directory.name.split("_")[-1]
                 if engine_name.endswith(".tar.gz"):
                     engine_name = engine_name.replace(".tar.gz", "")
-                if engine_name not in ENGINES:
+                if engine_name not in ENGINES and engine_name != "sample-outputs":
                     raise ValueError(
-                        f"The name of the 'sample-outputs' directory should "
-                        f"end with an engine name (one of the {ENGINES}). "
+                        f"The name of the sample-outputs directory should be"
+                        f"`sample-outputs` or shoud start with `sample-outputs_` and "
+                        f"end with an engine name (one of the {ENGINES})."
                         f"However, the name is {directory.name}."
                     )
                 engine_to_numpydir_map[engine_name] = directory
diff --git a/src/sparsezoo/objects/directories.py b/src/sparsezoo/objects/directories.py
@@ -197,6 +197,9 @@ class SelectDirectory(Directory):
     :param parent_directory: path of the parent SelectDirectory
     :param stub_params: dictionary of zoo stub params that this directory
         was specified with
+    :param tar_directory: optional pointer to the tar_directory
+        of this directory. By default, when downloading the directory
+        in question, we should download and extract the tarball.
     """
 
     def __init__(
@@ -207,6 +210,7 @@ def __init__(
         url: Optional[str] = None,
         parent_directory: Optional[str] = None,
         stub_params: Optional[Dict[str, str]] = None,
+        tar_directory: Optional[Directory] = None,
     ):
         self._default, self._available = None, None
 
@@ -217,7 +221,7 @@ def __init__(
             url=url,
             parent_directory=parent_directory,
         )
-
+        self.tar_directory = tar_directory
         self._stub_params = stub_params or {}
         self.files_dict = self.files_to_dictionary()
 
diff --git a/src/sparsezoo/objects/directory.py b/src/sparsezoo/objects/directory.py
@@ -168,18 +168,26 @@ def download(
                     "Please make sure that `destination_path` argument is not None."
                 )
 
+        # If tar_directory is not None, then we are downloading
+        # the directory as a tar archive file
+        target_directory = (
+            self if getattr(self, "tar_directory", None) is None else self.tar_directory
+        )
+
         # Directory can represent a tar file.
-        if self.is_archive:
-            new_file_path = os.path.join(destination_path, self.name)
+        # In this case, we download the tar file and unzip it.
+        if target_directory.is_archive:
+            new_file_path = os.path.join(destination_path, target_directory.name)
             for attempt in range(retries):
                 try:
                     download_file(
-                        url_path=self.url,
+                        url_path=target_directory.url,
                         dest_path=new_file_path,
                         overwrite=overwrite,
                     )
 
-                    self._path = new_file_path
+                    target_directory._path = new_file_path
+                    target_directory.unzip()
                     return
 
                 except Exception as err:
@@ -192,13 +200,15 @@ def download(
 
         # Directory can represent a folder or directory.
         else:
-            for file in self.files:
+            for file in target_directory.files:
                 file.download(
                     destination_path=destination_path,
                 )
-                file._path = os.path.join(destination_path, self.name, file.name)
+                file._path = os.path.join(
+                    destination_path, target_directory.name, file.name
+                )
 
-        self._path = os.path.join(destination_path, self.name)
+        target_directory._path = os.path.join(destination_path, target_directory.name)
 
     def get_file(self, file_name: str) -> Optional[File]:
         """
diff --git a/src/sparsezoo/objects/file.py b/src/sparsezoo/objects/file.py
@@ -95,7 +95,7 @@ def path(self):
         elif not os.path.exists(self._path):
             self.download()
 
-        return self._path
+        return self._path or self.path
 
     @classmethod
     def from_dict(
diff --git a/tests/sparsezoo/model/test_model.py b/tests/sparsezoo/model/test_model.py
@@ -23,7 +23,6 @@
 import pytest
 
 from sparsezoo import Model
-from sparsezoo.objects.directories import SelectDirectory
 
 
 files_ic = {
@@ -184,10 +183,6 @@ def setup(self, stub, clone_sample_outputs, expected_files):
         temp_dir = tempfile.TemporaryDirectory(dir="/tmp")
         model = Model(stub, temp_dir.name)
         model.download()
-        # since downloading the `deployment` file is
-        # disabled by default, we need to do it
-        # explicitly
-        model.deployment.download()
         self._add_mock_files(temp_dir.name, clone_sample_outputs=clone_sample_outputs)
         model = Model(temp_dir.name)
 
@@ -342,49 +337,6 @@ def test_model_gz_extraction_from_local_files(stub: str):
         "imagenet/pruned-moderate",
     ],
 )
-def test_model_deployment_directory(stub):
-    temp_dir = tempfile.TemporaryDirectory(dir="/tmp")
-    expected_deployment_files = ["model.onnx"]
-
-    model = Model(stub, temp_dir.name)
-    assert model.deployment_tar.is_archive
-    # download and extract deployment tar
-    deployment_dir_path = model.deployment_directory_path
-
-    # deployment and deployment_tar should be point to the same files
-    assert deployment_dir_path == model.deployment_tar.path == model.deployment.path
-    # make sure that the model contains expected files
-    assert set(os.listdir(temp_dir.name)) == {"deployment.tar.gz", "deployment"}
-    assert (
-        os.listdir(os.path.join(temp_dir.name, "deployment"))
-        == expected_deployment_files
-    )
-
-    assert isinstance(model.deployment, SelectDirectory)
-    # TODO: this should be 1. However, the API is returning for `deployment` file type
-    # both `model.onnx` and `deployment/model.onnx`.
-    # This should probably be fixed on the API side
-    assert (
-        len(model.deployment.files) == 2
-    )  # should be == len(expected_deployment_files)
-
-    assert isinstance(model.deployment_tar, SelectDirectory)
-    assert len(model.deployment_tar.files) == len(expected_deployment_files)
-    assert not model.deployment_tar.is_archive
-
-    # test recreating the model from the local files
-    model = Model(temp_dir.name)
-
-    assert isinstance(model.deployment, SelectDirectory)
-    assert len(model.deployment.files) == len(expected_deployment_files)
-
-    assert isinstance(model.deployment_tar, SelectDirectory)
-    assert len(model.deployment_tar.files) == len(expected_deployment_files)
-    assert not model.deployment_tar.is_archive
-
-    shutil.rmtree(temp_dir.name)
-
-
 def _extraction_test_helper(model: Model):
     # download and extract model.onnx.tar.gz
     #  path should point to extracted model.onnx file
diff --git a/tests/sparsezoo/model/test_utils.py b/tests/sparsezoo/model/test_utils.py
@@ -172,15 +172,14 @@ def test_setup_model_from_paths(self, setup):
             "recipe.md",
             "model.onnx",
             "model.onnx.tar.gz",
-            "sample-inputs.tar.gz",
+            "sample-inputs",
         }
         check_extraneous_files(expected_files, temp_dir, ignore_external_data)
 
     def test_setup_model_from_objects(self, setup):
         stub, temp_dir, download_dir, ignore_external_data = setup
         model = Model(stub, download_dir.name)
         model.download()
-        model.sample_inputs.unzip()
 
         training = model.training
         deployment = model.deployment