Get data from cache #525

Merged (2 commits) on Dec 17, 2024

Changes from all commits
executorlib/standalone/hdf.py (25 additions, 9 deletions)

@@ -1,9 +1,19 @@
-from typing import Optional, Tuple
+import os
+from typing import List, Optional, Tuple
 
 import cloudpickle
 import h5py
 import numpy as np
 
+group_dict = {
+    "fn": "function",
+    "args": "input_args",
+    "kwargs": "input_kwargs",
+    "output": "output",
+    "runtime": "runtime",
+    "queue_id": "queue_id",
+}
 
 
 def dump(file_name: str, data_dict: dict) -> None:
     """
@@ -13,14 +23,6 @@ def dump(file_name: str, data_dict: dict) -> None:
         file_name (str): file name of the HDF5 file as absolute path
         data_dict (dict): dictionary containing the python function to be executed {"fn": ..., "args": (), "kwargs": {}}
     """
-    group_dict = {
-        "fn": "function",
-        "args": "input_args",
-        "kwargs": "input_kwargs",
-        "output": "output",
-        "runtime": "runtime",
-        "queue_id": "queue_id",
-    }
     with h5py.File(file_name, "a") as fname:
         for data_key, data_value in data_dict.items():
             if data_key in group_dict.keys():
@@ -97,3 +99,17 @@ def get_queue_id(file_name: str) -> Optional[int]:
             return cloudpickle.loads(np.void(hdf["/queue_id"]))
         else:
             return None
+
+
+def get_cache_data(cache_directory: str) -> List[dict]:
+    file_lst = []
+    for file_name in os.listdir(cache_directory):
+        with h5py.File(os.path.join(cache_directory, file_name), "r") as hdf:
+            file_content_dict = {
+                key: cloudpickle.loads(np.void(hdf["/" + key]))
+                for key in group_dict.values()
+                if key in hdf
+            }
+        file_content_dict["filename"] = file_name
+        file_lst.append(file_content_dict)
+    return file_lst
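
In short, the new get_cache_data() helper walks the cache directory, deserializes every field named in group_dict from each HDF5 file (skipping fields a file does not contain), and returns one dict per cached task, tagged with its file name. A minimal usage sketch, assuming the API exercised by the test below (the ./cache path and printed values are illustrative):

    from executorlib import Executor
    from executorlib.standalone.hdf import get_cache_data

    # Run a few cached tasks (mirrors tests/test_cache_executor_interactive.py).
    with Executor(backend="local", cache_directory="./cache") as exe:
        future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]
        print([f.result() for f in future_lst])  # [2, 4, 6]

    # Each entry holds the deserialized "function", "input_args", "input_kwargs",
    # "output", "runtime", and "queue_id" fields that are present, plus "filename".
    for entry in get_cache_data(cache_directory="./cache"):
        print(entry["filename"], entry["output"])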
tests/test_cache_executor_interactive.py (33 additions, new file)

@@ -0,0 +1,33 @@
+import os
+import shutil
+import unittest
+
+from executorlib import Executor
+
+try:
+    from executorlib.standalone.hdf import get_cache_data
+
+    skip_h5py_test = False
+except ImportError:
+    skip_h5py_test = True
+
+
+@unittest.skipIf(
+    skip_h5py_test, "h5py is not installed, so the h5io tests are skipped."
+)
+class TestCacheFunctions(unittest.TestCase):
+    def test_cache_data(self):
+        cache_directory = "./cache"
+        with Executor(backend="local", cache_directory=cache_directory) as exe:
+            future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]
+            result_lst = [f.result() for f in future_lst]
+
+        cache_lst = get_cache_data(cache_directory=cache_directory)
+        self.assertEqual(sum([c["output"] for c in cache_lst]), sum(result_lst))
+        self.assertEqual(
+            sum([sum(c["input_args"][0]) for c in cache_lst]), sum(result_lst)
+        )
Comment on lines +19 to +29 (Contributor):

🛠️ Refactor suggestion

Enhance test coverage and use temporary directory

The test could be improved in several ways:

  1. Use tempfile.mkdtemp() instead of hardcoded "./cache"
  2. Add test cases for error conditions
  3. Validate all cached fields, not just output and input_args

Consider this improved implementation:

+import tempfile
+
 def test_cache_data(self):
-    cache_directory = "./cache"
+    cache_directory = tempfile.mkdtemp()
     with Executor(backend="local", cache_directory=cache_directory) as exe:
         future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]
         result_lst = [f.result() for f in future_lst]

     cache_lst = get_cache_data(cache_directory=cache_directory)
+    # Validate cache size
+    self.assertEqual(len(cache_lst), len(future_lst))
+
+    # Validate all cached fields
+    for cache_entry in cache_lst:
+        self.assertIn('function', cache_entry)
+        self.assertIn('input_args', cache_entry)
+        self.assertIn('input_kwargs', cache_entry)
+        self.assertIn('output', cache_entry)
+        self.assertIn('runtime', cache_entry)
+
     self.assertEqual(sum([c["output"] for c in cache_lst]), sum(result_lst))
     self.assertEqual(
         sum([sum(c["input_args"][0]) for c in cache_lst]), sum(result_lst)
     )
+
+def test_cache_data_invalid_directory(self):
+    with self.assertRaises(ValueError):
+        get_cache_data("/nonexistent/path")
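
One caveat on the last suggested case (a note, not from the review): os.listdir() raises FileNotFoundError for a nonexistent path, so assertRaises(ValueError) would only pass if get_cache_data() added explicit validation, for example a hypothetical guard like this:

    def get_cache_data(cache_directory: str) -> List[dict]:
        # Hypothetical check, not in the PR: without it, os.listdir() raises
        # FileNotFoundError rather than ValueError for a missing directory.
        if not os.path.isdir(cache_directory):
            raise ValueError(f"not a directory: {cache_directory}")
        ...  # rest of the function as in the diff above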

+    def tearDown(self):
+        if os.path.exists("cache"):
+            shutil.rmtree("cache")
Comment on lines +31 to +33 (Contributor):

🛠️ Refactor suggestion

Update cleanup to use test's cache directory

The tearDown method should clean up the temporary directory used in the test, not a hardcoded path.

 def tearDown(self):
-    if os.path.exists("cache"):
-        shutil.rmtree("cache")
+    if hasattr(self, '_testMethodName'):
+        method = getattr(self, self._testMethodName)
+        if hasattr(method, 'cache_directory'):
+            shutil.rmtree(method.cache_directory)

Committable suggestion skipped: line range outside the PR's diff.
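
A simpler pattern, sketched here as an alternative and not part of the review thread, is to create the temporary directory in setUp and store it on the test instance, so tearDown always removes exactly what the test created:

    import shutil
    import tempfile
    import unittest


    class TestCacheFunctions(unittest.TestCase):
        def setUp(self):
            # Fresh temporary cache per test, tracked on the instance.
            self.cache_directory = tempfile.mkdtemp()

        def tearDown(self):
            # Remove only the directory this test created.
            shutil.rmtree(self.cache_directory, ignore_errors=True)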
