def _get_cached_file(self, filepath):
    """
    Return a cached NetCDF file handle, opening and caching it on first use.

    Reusing handles speeds up parallel processing by avoiding repeated file
    opens. The file is first opened with ``diskless=True`` (loads the file
    into memory), which is intended to avoid HDF5 file locking issues in
    parallel/distributed environments. If diskless opening keeps failing,
    a regular on-disk open is attempted as a last resort.

    Args:
        filepath: Path of the NetCDF file; also used as the key in
            ``self.file_cache``.

    Returns:
        The dataset handle stored in ``self.file_cache[filepath]``.

    Raises:
        RuntimeError: If the file cannot be opened in diskless or normal
            mode. The normal-mode error is attached as ``__cause__`` and the
            diskless-mode error is included in the message.
    """
    # Fast path: the handle is already open.
    if filepath in self.file_cache:
        return self.file_cache[filepath]

    if self.verbose:
        print(f"Opening and caching: {filepath}")

    # Local import keeps this method self-contained with respect to the
    # module's import block.
    import time

    max_retries = 3
    retry_delay = 0.5
    attempts_made = 0
    diskless_error = None

    for attempt in range(max_retries):
        attempts_made = attempt + 1
        try:
            # diskless=True loads file into memory, avoiding HDF5 multiprocess locking issues
            self.file_cache[filepath] = nc.Dataset(filepath, "r", diskless=True, persist=False)
            return self.file_cache[filepath]
        except TypeError as e:
            # A TypeError from the call is deterministic; sleeping and
            # retrying cannot help, so go straight to the normal-mode
            # fallback.
            diskless_error = e
            break
        except (OSError, RuntimeError) as e:
            diskless_error = e
            if attempt < max_retries - 1:
                # Retry with exponential backoff
                if self.verbose:
                    print(f"Warning: Attempt {attempt + 1} failed for {filepath}, retrying: {e}")
                time.sleep(retry_delay * (2 ** attempt))

    # Final attempt: try without diskless
    if self.verbose:
        print(f"Warning: diskless mode failed after {attempts_made} attempts, trying normal mode: {diskless_error}")
    try:
        self.file_cache[filepath] = nc.Dataset(filepath, "r")
    except Exception as e2:
        raise RuntimeError(
            f"Failed to open {filepath} with both diskless and normal modes: {e2}"
            f" (diskless error: {diskless_error})"
        ) from e2
    return self.file_cache[filepath]
656681
657682 def close_cached_files (self ):
0 commit comments