fixup

ray-chew · ray-chew · commit 4020ed1de3e5 · 2025-10-25T02:34:08.000-07:00
diff --git a/runs/icon_etopo_global.py b/runs/icon_etopo_global.py
@@ -461,8 +461,10 @@ def group_cells_by_memory(clat_rad, max_memory_per_batch_gb=240.0):
             if avg_mem * len(current_batch_indices) > max_memory_per_batch_gb:
                 # Finalize current batch
                 avg_mem_current = np.mean(current_batch_memory)
-                n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem_current * 1.2)))  # 20% safety margin
-                mem_per_worker = avg_mem_current * 1.2
+                # Use 30% safety margin for diskless NetCDF loading
+                safety_factor = 1.3
+                n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem_current * safety_factor)))
+                mem_per_worker = avg_mem_current * safety_factor
 
                 batches.append({
                     'cell_indices': sorted(current_batch_indices),  # Sort by original index order
@@ -484,8 +486,10 @@ def group_cells_by_memory(clat_rad, max_memory_per_batch_gb=240.0):
     # Finalize last batch
     if current_batch_indices:
         avg_mem = np.mean(current_batch_memory)
-        n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem * 1.2)))
-        mem_per_worker = avg_mem * 1.2
+        # Use 30% safety margin for diskless NetCDF loading
+        safety_factor = 1.3
+        n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem * safety_factor)))
+        mem_per_worker = avg_mem * safety_factor
 
         batches.append({
             'cell_indices': sorted(current_batch_indices),
@@ -523,10 +527,10 @@ def parallel_wrapper(grid, params, reader, writer, chunk_output_dir, clat_rad, c
             'description': 'Generic laptop (16 threads, 16GB RAM)'
         },
         'dkrz_hpc': {
-            'total_cores': 128,
+            'total_cores': 8,
             'total_memory_gb': 240.0,
-            'netcdf_chunk_size': 1000,
-            'memory_per_cpu_mb': 1940,  # SLURM quota on interactive partition
+            'netcdf_chunk_size': 100,
+            'memory_per_cpu_mb': None,  # SLURM quota on interactive partition
             'description': 'DKRZ HPC interactive partition (standard memory node)'
         },
         'laptop_performance': {
@@ -722,11 +726,9 @@ def parallel_wrapper(grid, params, reader, writer, chunk_output_dir, clat_rad, c
 
                 # Calculate threads per worker based on configuration
                 if config['memory_per_cpu_mb'] is not None:
-                    # HPC mode: Use SLURM's memory-per-CPU quota
-                    # Each worker gets CPUs proportional to its memory allocation
-                    threads_per_worker = max(1, int(
-                        batch_config['memory_per_worker_gb'] * 1000 / config['memory_per_cpu_mb']
-                    ))
+                    # HPC mode: Distribute total cores evenly across workers
+                    # (Don't use memory_per_cpu_mb to calculate threads - it's unreliable)
+                    threads_per_worker = max(1, config['total_cores'] // n_workers)
                 else:
                     # Laptop mode: Calculate based on total available resources
                     # How many workers can we fit given memory constraints?