fixup: open cached NetCDF files in diskless mode, with retries and a normal-mode fallback

ray-chew · ray-chew · commit ffb4ae7a8cec · 2025-10-25T01:47:00.000-07:00
diff --git a/pycsa/core/io.py b/pycsa/core/io.py
@@ -647,11 +647,36 @@ def _get_cached_file(self, filepath):
             """
             Get a cached NetCDF file handle, or open and cache if not already open.
             This dramatically speeds up parallel processing by avoiding repeated file opens.
+
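+            Opens with diskless=True (the file is loaded into memory) to avoid HDF5 file locking issues in parallel/distributed environments. A failed open is retried with exponential backoff; if every diskless attempt fails, a normal open is tried before RuntimeError is raised.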
             """
             if filepath not in self.file_cache:
                 if self.verbose:
                     print(f"Opening and caching: {filepath}")
-                self.file_cache[filepath] = nc.Dataset(filepath, "r")
+
+                import time
+                max_retries = 3
+                retry_delay = 0.5
+
+                for attempt in range(max_retries):
+                    try:
+                        # diskless=True loads file into memory, avoiding HDF5 multiprocess locking issues
+                        self.file_cache[filepath] = nc.Dataset(filepath, "r", diskless=True, persist=False)
+                        break
+                    except (OSError, RuntimeError, TypeError) as e:
+                        if attempt < max_retries - 1:
+                            # Retry with exponential backoff
+                            if self.verbose:
+                                print(f"Warning: Attempt {attempt+1} failed for {filepath}, retrying: {e}")
+                            time.sleep(retry_delay * (2 ** attempt))
+                        else:
+                            # Final attempt: try without diskless
+                            if self.verbose:
+                                print(f"Warning: diskless mode failed after {max_retries} attempts, trying normal mode: {e}")
+                            try:
+                                self.file_cache[filepath] = nc.Dataset(filepath, "r")
+                            except Exception as e2:
+                                raise RuntimeError(f"Failed to open {filepath} with both diskless and normal modes: {e2}")
             return self.file_cache[filepath]
 
         def close_cached_files(self):