NFS-safe Dependency Manager #3861

Merged
merged 75 commits into master from flufl on Apr 4, 2022

Commits (75)
b0f597b
nfs locking for dependency manager
teetone Oct 26, 2021
e0e3bc0
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Oct 26, 2021
80facb0
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Nov 8, 2021
ae2b7c2
NFS-safe DependencyManager
teetone Nov 23, 2021
16c69f2
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Nov 23, 2021
195529e
fix formatting
teetone Nov 23, 2021
853ec49
basic test
teetone Nov 23, 2021
200dfa2
Tests
teetone Nov 24, 2021
19e723d
NFS-safe
teetone Nov 29, 2021
3a1ba09
Single global lock + update comprehensive test
teetone Nov 29, 2021
5577ea0
cleanup
teetone Nov 29, 2021
cd45659
cleanup
teetone Nov 29, 2021
eda1e74
disable test for the ratarmountcore issue
teetone Nov 29, 2021
19bc723
test
teetone Nov 29, 2021
ba1578d
test
teetone Nov 29, 2021
7bc1009
debug
teetone Nov 29, 2021
6b643f2
make debugging easier
teetone Nov 30, 2021
d7a0263
fix test
teetone Nov 30, 2021
734879e
debug
teetone Nov 30, 2021
5a6f4b5
log when failing to load or commit state to JSON file
teetone Nov 30, 2021
b51c1b2
log when failing to load or commit state to JSON file
teetone Nov 30, 2021
3e657c4
more logging
teetone Nov 30, 2021
877d6c5
Direct I/O for state committer
teetone Dec 1, 2021
9e30947
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 1, 2021
233df8b
Merge branch 'flufl' of https://github.com/codalab/codalab-worksheets…
teetone Dec 1, 2021
6725a89
test
teetone Dec 1, 2021
761afdb
fix formatting; increase stress for test
teetone Dec 1, 2021
efc350a
cleanup
teetone Dec 1, 2021
1a5d5df
Reduce I/O
teetone Dec 1, 2021
574a37d
more logging for job scheduling
teetone Dec 1, 2021
6f80a9d
abort download
teetone Dec 1, 2021
830bd5c
Update download status periodically
teetone Dec 1, 2021
9cd5223
Improve download performance && handle state read errors
teetone Dec 3, 2021
dec1766
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 3, 2021
e5d8ea7
temp disable test
teetone Dec 3, 2021
f8d4f56
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 4, 2021
3bc1f2e
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 8, 2021
93c7577
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 13, 2021
aaa72a2
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Dec 22, 2021
8376d23
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Jan 26, 2022
e429edc
log when its own dependency manager is in the middle of downloading a…
teetone Jan 26, 2022
22bd061
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Jan 26, 2022
b5ce6d8
debug
teetone Jan 26, 2022
840755c
Allow running multiple workers
epicfaace Jan 26, 2022
22c615d
fix worker2
epicfaace Jan 26, 2022
2df98b1
fix id of second worker
epicfaace Jan 26, 2022
c4cba5f
fix: use separate state files for each worker
epicfaace Jan 26, 2022
03b26b0
don't do debug for now
epicfaace Jan 26, 2022
0090c41
remove flag
epicfaace Jan 26, 2022
43b2737
re-add alembic
epicfaace Jan 26, 2022
224402a
make nfsdependencymanager the default
teetone Jan 26, 2022
953b2de
comment explaining unique id
teetone Jan 26, 2022
aaf03f7
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Jan 27, 2022
a96ec74
combine dependency managers
teetone Jan 28, 2022
227e754
Bump flufl.lock version to 7.0
teetone Jan 28, 2022
512b568
Rever back to older version of flufl.lock
teetone Jan 28, 2022
597ecb3
revert back to the old state_committer commit
teetone Jan 28, 2022
bfe62cf
acquire reentrant lock before flufl lock
teetone Feb 2, 2022
3541ddf
resolve merge conflicts
teetone Feb 2, 2022
5bc73ee
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Feb 6, 2022
68dba36
cleanup
teetone Feb 14, 2022
9bd39ea
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Feb 14, 2022
21000c6
fix test
teetone Feb 14, 2022
bd544ca
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Feb 25, 2022
514bb35
cleanup
teetone Feb 25, 2022
4f21a6f
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Feb 25, 2022
dddaed2
fix teest
teetone Feb 27, 2022
1a07aab
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Feb 27, 2022
fbbecc8
Update codalab/worker/dependency_manager.py
epicfaace Mar 2, 2022
b3be38e
Update codalab/worker/dependency_manager.py
epicfaace Mar 2, 2022
fcc0675
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Mar 2, 2022
821de7d
Merge branch 'flufl' of https://github.com/codalab/codalab-worksheets…
teetone Mar 2, 2022
2dbca5b
Merge branch 'master' of https://github.com/codalab/codalab-worksheet…
teetone Mar 28, 2022
d4f02d8
wrap fetch_state calls
teetone Mar 28, 2022
dd3ba54
Merge branch 'master' into flufl
mergify[bot] Apr 4, 2022
614 changes: 385 additions & 229 deletions codalab/worker/dependency_manager.py

Large diffs are not rendered by default.
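
Since the main `dependency_manager.py` diff is not rendered, here is a minimal sketch of the locking pattern the commit history describes (a single global flufl.lock plus a reentrant in-process lock guarding the shared dependencies state file, with `flufl.lock` added in `requirements.txt` below). This is not the PR's actual implementation; the class and method names are illustrative only.

```python
# Minimal sketch, NOT the actual dependency_manager.py implementation (not rendered
# above). Assumes the pattern implied by the commits "Single global lock" and
# "acquire reentrant lock before flufl lock"; names here are illustrative.
import threading
from contextlib import contextmanager

from flufl.lock import Lock  # hard-link based lock files, safe over NFS


class NFSSafeStateAccess:
    def __init__(self, state_path: str):
        self._state_path = state_path
        # In-process reentrant lock, acquired before the NFS lock.
        self._state_lock = threading.RLock()
        # One global lock file next to the JSON state file.
        self._nfs_lock = Lock(state_path + '.lock')

    @contextmanager
    def exclusive_state_access(self):
        """Serialize reads and writes of the state file across threads and machines."""
        with self._state_lock:
            self._nfs_lock.lock()
            try:
                yield self._state_path
            finally:
                self._nfs_lock.unlock()
```

Every `get`, `release`, and commit of dependency state would then run inside such a context, so multiple workers sharing one NFS-mounted work directory never write conflicting state.
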

4 changes: 3 additions & 1 deletion codalab/worker/main.py
@@ -304,7 +304,9 @@ def main():
worker = Worker(
image_manager,
dependency_manager,
os.path.join(args.work_dir, 'worker-state.json'),
# Include the worker ID in the worker state JSON path, so multiple workers
# sharing the same work directory maintain their own state.
os.path.join(args.work_dir, f'worker-state-{args.id}.json'),
args.cpuset,
args.gpuset,
args.max_memory,
33 changes: 24 additions & 9 deletions codalab/worker/state_committer.py
@@ -2,6 +2,7 @@
import os
import tempfile
import shutil

from . import pyjson


@@ -20,30 +21,44 @@ def commit(self, state):

class JsonStateCommitter(BaseStateCommitter):
def __init__(self, json_path):
self.temp_file = None
self._state_file = json_path

@property
def path(self):
return self._state_file

@property
def state_file_exists(self) -> bool:
return os.path.isfile(self._state_file)

def load(self, default=None):
"""
Loads the state from the state file. If an error occurs and `default` is provided, returns `default`; otherwise re-raises the error.
"""
try:
with open(self._state_file) as json_data:
return pyjson.load(json_data)
except (ValueError, EnvironmentError):
return dict() if default is None else default
except (ValueError, EnvironmentError) as e:
if default is not None:
logger.warning(
f"Failed to load state from {self.path} due to {e}. Returning default: {default}.",
exc_info=True,
)
return default
logger.error(f"Failed to load state from {self.path}: {e}", exc_info=True)
raise e

def commit(self, state):
""" Write out the state in JSON format to a temporary file and rename it into place """
with tempfile.NamedTemporaryFile(delete=False) as f:
try:
self.temp_file = f.name
f.write(pyjson.dumps(state).encode())
f.flush()
shutil.copyfile(self.temp_file, self._state_file)
shutil.copyfile(f.name, self._state_file)
finally:
try:
os.unlink(self.temp_file)
os.unlink(f.name)
except FileNotFoundError:
logger.error(
"Problem occurred in deleting temp file {} via os.unlink".format(
self.temp_file
)
"Problem occurred in deleting temp file {} via os.unlink".format(f.name)
)
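
For reference, a usage sketch of the revised `JsonStateCommitter` API shown in this diff; the state path and payload below are hypothetical examples, not values from the PR.

```python
# Usage sketch for the JsonStateCommitter changes above; path and payload are
# hypothetical.
from codalab.worker.state_committer import JsonStateCommitter

committer = JsonStateCommitter('/tmp/dependencies-state.json')
assert not committer.state_file_exists

# commit() writes the JSON to a temporary file first, then copies it into place,
# so readers never observe a half-written state file.
committer.commit({'runs': {}})
assert committer.state_file_exists
assert committer.path == '/tmp/dependencies-state.json'

# With a default, load() logs a warning and returns the default on read errors;
# without one, it logs an error and re-raises.
state = committer.load(default=dict())
```
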
3 changes: 2 additions & 1 deletion codalab/worker/worker.py
@@ -194,7 +194,8 @@ def save_state(self):
self.state_committer.commit(runs)

def load_state(self):
runs = self.state_committer.load()
# If the state file doesn't exist yet, have the state committer return an empty state.
runs = self.state_committer.load(default=dict())
# Retrieve the complex container objects from the Docker API
for uuid, run_state in runs.items():
if run_state.container_id:
56 changes: 36 additions & 20 deletions codalab/worker/worker_run_state.py
@@ -5,6 +5,7 @@
import threading
import time
import traceback
from typing import Dict

import codalab.worker.docker_utils as docker_utils

@@ -251,13 +252,26 @@ def mount_dependency(dependency, shared_file_system):

dependencies_ready = True
status_messages = []
dependency_keys_to_paths: Dict[DependencyKey, str] = dict()

if not self.shared_file_system:
# No need to download dependencies if we're in the shared FS,
# since they're already in our FS
for dep in run_state.bundle.dependencies:
dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)

try:
# Fetching dependencies from the Dependency Manager can fail.
# Just update the download status on the next iteration of this transition function.
dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)
dependency_keys_to_paths[dep_key] = os.path.join(
self.dependency_manager.dependencies_dir, dependency_state.path
)
except Exception:
status_messages.append(f'Downloading dependency {dep.child_path} failed')
dependencies_ready = False
continue

if dependency_state.stage == DependencyStage.DOWNLOADING:
status_messages.append(
'Downloading dependency %s: %s done (archived size)'
@@ -311,7 +325,7 @@
if run_state.bundle_dir_wait_num_tries == 0:
message = (
"Bundle directory cannot be found on the shared filesystem. "
"Please ensure the shared fileystem between the server and "
"Please ensure the shared filesystem between the server and "
"your worker is mounted properly or contact your administrators."
)
log_bundle_transition(
@@ -344,7 +358,13 @@ def mount_dependency(dependency, shared_file_system):
for dep in run_state.bundle.dependencies:
full_child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep.child_path))
to_mount = []
dependency_path = self._get_dependency_path(run_state, dep)
if self.shared_file_system:
# TODO(Ashwin): make this not fs-specific.
# On a shared FS, we know where the dependency is stored and can get the contents directly
dependency_path = os.path.realpath(os.path.join(dep.location, dep.parent_path))
else:
dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
dependency_path = dependency_keys_to_paths[dep_key]

if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
# Mount all the content of the dependency_path to the top-level of the bundle
@@ -441,19 +461,6 @@ def mount_dependency(dependency, shared_file_system):
gpuset=gpuset,
)

def _get_dependency_path(self, run_state, dependency):
if self.shared_file_system:
# TODO(Ashwin): make this not fs-specific.
# On a shared FS, we know where the dependency is stored and can get the contents directly
return os.path.realpath(os.path.join(dependency.location, dependency.parent_path))
else:
# On a dependency_manager setup, ask the manager where the dependency is
dep_key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
return os.path.join(
self.dependency_manager.dependencies_dir,
self.dependency_manager.get(run_state.bundle.uuid, dep_key).path,
)

def _transition_from_RUNNING(self, run_state):
"""
1- Check run status of the docker container
@@ -608,10 +615,19 @@ def remove_path_no_fail(path):
logger.error(traceback.format_exc())
time.sleep(1)

for dep in run_state.bundle.dependencies:
if not self.shared_file_system: # No dependencies if shared fs worker
dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
self.dependency_manager.release(run_state.bundle.uuid, dep_key)
try:
# Fetching dependencies from the Dependency Manager can fail.
# Finish cleaning up on the next iteration of this transition function.
for dep in run_state.bundle.dependencies:
if not self.shared_file_system: # No dependencies if shared fs worker
dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
self.dependency_manager.release(run_state.bundle.uuid, dep_key)
except (ValueError, EnvironmentError):
# Do nothing if an error is thrown while reading from the state file
logging.exception(
f"Error reading from dependencies state file while releasing a dependency from {run_state.bundle.uuid}"
)
return run_state

# Clean up dependencies paths
for path in run_state.paths_to_remove or []:
2 changes: 1 addition & 1 deletion codalab/worker_manager/slurm_batch_worker_manager.py
@@ -268,7 +268,7 @@ def setup_codalab_worker(self, worker_id):
work_dir_prefix = Path()

worker_work_dir = work_dir_prefix.joinpath(
Path('{}-codalab-SlurmBatchWorkerManager-scratch'.format(self.username), worker_id)
Path('{}-codalab-SlurmBatchWorkerManager-scratch'.format(self.username), "workdir")
)
command = self.build_command(worker_id, str(worker_work_dir))

11 changes: 11 additions & 0 deletions docs/Worker-Managers.md
@@ -12,6 +12,17 @@ We support the following Worker Managers:
| slurm-batch | Worker manager for submitting jobs using Slurm Batch. |
| kubernetes | Worker manager for submitting jobs to a Kubernetes cluster. |

## Setting a shared cache

To use a shared cache among workers, have all the workers use the same working directory by passing
the same path to `--work-dir`. When starting a worker manager, the working directory is set with
`--worker-work-dir-prefix`. The dependency manager is NFS-safe, so the working directory can live on a network disk.

```commandline
cl-worker-manager --worker-work-dir-prefix /juice slurm-batch --cpus 4 --gpus 1 --memory-mb 16000
```

In the example worker manager command above, `/juice` is a directory on a network disk.
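
Standalone workers can share a cache in the same way. A hypothetical example (the flag names follow `codalab/worker/main.py`; the server URL, worker IDs, and paths are placeholders):

```commandline
cl-worker --server https://worksheets.example.org --id worker-1 --work-dir /juice/shared-workdir
cl-worker --server https://worksheets.example.org --id worker-2 --work-dir /juice/shared-workdir
```

Because the worker state file now includes the worker ID (`worker-state-<id>.json`, see the `main.py` change above), both workers can share the same working directory while keeping separate run state.
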

## AWS Batch Worker Manager

1 change: 1 addition & 0 deletions requirements.txt
@@ -28,3 +28,4 @@ wheel==0.35.1
urllib3==1.26.5
retry==0.9.2
spython==0.1.14
flufl.lock==6.0
147 changes: 147 additions & 0 deletions tests/unit/worker/dependency_manager_test.py
@@ -0,0 +1,147 @@
import os
import time
import unittest
import shutil
import tempfile
from concurrent.futures import ProcessPoolExecutor
from unittest.mock import MagicMock

from codalab.worker.bundle_state import DependencyKey

try:
from codalab.worker.dependency_manager import DependencyManager

module_failed = False
except ImportError:
module_failed = True


class DependencyManagerTest(unittest.TestCase):
def setUp(self):
if module_failed:
self.skipTest('Issue with ratarmountcore.')

self.work_dir = tempfile.mkdtemp()
self.state_path = os.path.join(self.work_dir, "dependencies-state.json")
self.dependency_manager = DependencyManager(
commit_file=self.state_path,
bundle_service=None,
worker_dir=self.work_dir,
max_cache_size_bytes=1024,
download_dependencies_max_retries=1,
)

def tearDown(self):
shutil.rmtree(self.work_dir)

def test_get_has(self):
dependent_uuid = "0x2"
dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
state = self.dependency_manager.get(dependent_uuid, dependency_key)
self.assertTrue(self.dependency_manager.has(dependency_key))
self.assertEqual(state.stage, "DOWNLOADING")
self.assertEqual(state.path, "0x1_parent")
self.assertEqual(state.dependents, {dependent_uuid})

def test_release(self):
dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
self.dependency_manager.get("0x2", dependency_key)
state = self.dependency_manager.get("0x3", dependency_key)
# Passing in the same dependency key with a different dependent will just add the dependent
self.assertEqual(state.dependents, {"0x2", "0x3"})

# Release 0x2 as a dependent
self.dependency_manager.release("0x2", dependency_key)
with self.dependency_manager._state_lock:
dependencies = self.dependency_manager._fetch_dependencies()
state = dependencies[dependency_key]
self.assertEqual(state.dependents, {"0x3"})

# Release 0x3 as a dependent - should be left with no dependents
self.dependency_manager.release("0x3", dependency_key)
with self.dependency_manager._state_lock:
dependencies = self.dependency_manager._fetch_dependencies()
state = dependencies[dependency_key]
self.assertEqual(len(state.dependents), 0)

def test_all_dependencies(self):
dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
self.dependency_manager.get("0x2", dependency_key)
dependency_key = DependencyKey(parent_uuid="0x3", parent_path="parent2")
self.dependency_manager.get("0x4", dependency_key)
dependency_keys = self.dependency_manager.all_dependencies
self.assertEqual(len(dependency_keys), 2)

@unittest.skip(
"Flufl.lock doesn't seem to work on GHA for some reason, "
"even though this test passes on other machines."
)
def test_concurrency(self):
num_of_dependency_managers = 10
executor = ProcessPoolExecutor(max_workers=num_of_dependency_managers)

random_file_path = os.path.join(self.work_dir, "random_file")
with open(random_file_path, "wb") as f:
f.seek((1024 * 1024 * 1024) - 1) # 1 GB
f.write(b"\0")

futures = [
executor.submit(task, self.work_dir, self.state_path, random_file_path)
for _ in range(num_of_dependency_managers)
]
for future in futures:
print(future.result())
self.assertIsNone(future.exception())
executor.shutdown()


def task(work_dir, state_path, random_file_path):
"""
Runs the end-to-end workflow of the Dependency Manager.
Note: ProcessPoolExecutor must serialize everything before sending it to the worker,
so this function needs to be defined at the top-level.
# """
# Mock Bundle Service to return a random file object
mock_bundle_service = MagicMock()
mock_bundle_service.get_bundle_info = MagicMock(return_value={'type': "file"})
file_obj = open(random_file_path, "rb")
mock_bundle_service.get_bundle_contents = MagicMock(return_value=file_obj)

# Create and start a dependency manager
process_id = os.getpid()
print(f"{process_id}: Starting a DependencyManager...")
dependency_manager = DependencyManager(
commit_file=state_path,
bundle_service=mock_bundle_service,
worker_dir=work_dir,
max_cache_size_bytes=2048,
download_dependencies_max_retries=1,
)
dependency_manager.start()
print(f"{process_id}: Started with work directory: {work_dir}.")

# Register a run's UUID as a dependent of a parent bundle with UUID 0x1
dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
run_uuid = f"0x{process_id}"
state = dependency_manager.get(run_uuid, dependency_key)
assert (
run_uuid in state.dependents
), f"{process_id}: Expected {run_uuid} as one of the dependents."

# Release the run bundle as a dependent
dependency_manager.release(run_uuid, dependency_key)
dependencies = dependency_manager._fetch_dependencies()
if dependency_key in dependencies:
state = dependencies[dependency_key]
print(f"{process_id}: Checking {run_uuid} in {state.dependents}")
assert (
run_uuid not in state.dependents
), f"{process_id}: Dependent should not be in the list of dependents after unregistering."

# Keep the dependency manager running for some time to test the loop
time.sleep(30)

# Stop the Dependency Manager
print(f"{process_id}: Stopping DependencyManger...")
dependency_manager.stop()
print(f"{process_id}: Done.")
8 changes: 6 additions & 2 deletions tests/unit/worker/state_committer_test.py
@@ -21,7 +21,12 @@ def tearDown(self):

def test_path_parsing(self):
""" Simple test to ensure we don't mess up the state file path"""
self.assertEqual(self.committer._state_file, self.state_path)
self.assertEqual(self.committer.path, self.state_path)

def test_state_file_exists(self):
self.assertFalse(self.committer.state_file_exists)
self.committer.commit({'state': 'value'})
self.assertTrue(self.committer.state_file_exists)

def test_commit(self):
"""Make sure state is committed correctly"""
@@ -30,7 +35,6 @@ def test_commit(self):
self.committer.commit(test_state)
with open(self.state_path) as f:
self.assertEqual(test_state_json_str, f.read())
self.assertFalse(os.path.exists(self.committer.temp_file))

def test_load(self):
""" Make sure load loads the state file if it exists """
@@ -40,7 +40,7 @@ def test_base_command(self):

expected_command_str = (
"cl-worker --server some_server --verbose --exit-when-idle --idle-seconds 888 "
"--work-dir /some/path/some_user-codalab-SlurmBatchWorkerManager-scratch/some_worker_id "
"--work-dir /some/path/some_user-codalab-SlurmBatchWorkerManager-scratch/workdir "
"--id $(hostname -s)-some_worker_id --network-prefix cl_worker_some_worker_id_network --tag some_tag "
"--group some_group --exit-after-num-runs 8 --download-dependencies-max-retries 5 "
"--max-work-dir-size 88g --checkin-frequency-seconds 30 --shared-memory-size-gb 10 "