Commit 4020ed1

fixup
1 parent ffb4ae7 commit 4020ed1

File tree

1 file changed: +14 -12 lines changed

runs/icon_etopo_global.py

Lines changed: 14 additions & 12 deletions
@@ -461,8 +461,10 @@ def group_cells_by_memory(clat_rad, max_memory_per_batch_gb=240.0):
         if avg_mem * len(current_batch_indices) > max_memory_per_batch_gb:
             # Finalize current batch
             avg_mem_current = np.mean(current_batch_memory)
-            n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem_current * 1.2)))  # 20% safety margin
-            mem_per_worker = avg_mem_current * 1.2
+            # Use 30% safety margin for diskless NetCDF loading
+            safety_factor = 1.3
+            n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem_current * safety_factor)))
+            mem_per_worker = avg_mem_current * safety_factor
 
             batches.append({
                 'cell_indices': sorted(current_batch_indices),  # Sort by original index order
@@ -484,8 +486,10 @@ def group_cells_by_memory(clat_rad, max_memory_per_batch_gb=240.0):
     # Finalize last batch
     if current_batch_indices:
         avg_mem = np.mean(current_batch_memory)
-        n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem * 1.2)))
-        mem_per_worker = avg_mem * 1.2
+        # Use 30% safety margin for diskless NetCDF loading
+        safety_factor = 1.3
+        n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem * safety_factor)))
+        mem_per_worker = avg_mem * safety_factor
 
         batches.append({
             'cell_indices': sorted(current_batch_indices),
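
Both hunks above make the same change to the batch finalization step: the hard-coded 20% safety margin becomes a named safety_factor of 1.3, leaving extra headroom for diskless NetCDF loading. A minimal standalone sketch of that calculation, with illustrative per-cell memory estimates (the values are assumptions, not taken from a real run):

    import numpy as np

    def finalize_batch(current_batch_memory, max_memory_per_batch_gb=240.0):
        # Average memory estimate (GB) over the cells collected so far.
        avg_mem = np.mean(current_batch_memory)
        # 30% safety margin for diskless NetCDF loading (was 1.2 before this commit).
        safety_factor = 1.3
        n_workers = max(1, int(max_memory_per_batch_gb / (avg_mem * safety_factor)))
        mem_per_worker = avg_mem * safety_factor
        return n_workers, mem_per_worker

    # Example: cells estimated at ~18 GB each on a 240 GB budget:
    # 240 / (18 * 1.3) = 10.25 -> 10 workers, ~23.4 GB budgeted per worker.
    print(finalize_batch([17.5, 18.0, 18.5]))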
@@ -523,10 +527,10 @@ def parallel_wrapper(grid, params, reader, writer, chunk_output_dir, clat_rad, c
         'description': 'Generic laptop (16 threads, 16GB RAM)'
     },
     'dkrz_hpc': {
-        'total_cores': 128,
+        'total_cores': 8,
         'total_memory_gb': 240.0,
-        'netcdf_chunk_size': 1000,
-        'memory_per_cpu_mb': 1940,  # SLURM quota on interactive partition
+        'netcdf_chunk_size': 100,
+        'memory_per_cpu_mb': None,  # SLURM quota on interactive partition
         'description': 'DKRZ HPC interactive partition (standard memory node)'
     },
     'laptop_performance': {
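
This hunk scales the 'dkrz_hpc' profile down to what an interactive session actually gets: 8 cores, smaller NetCDF chunks, and memory_per_cpu_mb set to None so the per-CPU SLURM quota no longer drives sizing. A self-contained sketch of the updated entry; the key values mirror the diff, while the PARALLEL_CONFIGS name and the lookup below are assumptions for illustration:

    PARALLEL_CONFIGS = {
        'dkrz_hpc': {
            'total_cores': 8,            # was 128
            'total_memory_gb': 240.0,
            'netcdf_chunk_size': 100,    # was 1000
            'memory_per_cpu_mb': None,   # None disables the per-CPU quota path
            'description': 'DKRZ HPC interactive partition (standard memory node)',
        },
    }

    config = PARALLEL_CONFIGS['dkrz_hpc']
    print(config['total_cores'], config['memory_per_cpu_mb'])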
@@ -722,11 +726,9 @@ def parallel_wrapper(grid, params, reader, writer, chunk_output_dir, clat_rad, c
 
     # Calculate threads per worker based on configuration
     if config['memory_per_cpu_mb'] is not None:
-        # HPC mode: Use SLURM's memory-per-CPU quota
-        # Each worker gets CPUs proportional to its memory allocation
-        threads_per_worker = max(1, int(
-            batch_config['memory_per_worker_gb'] * 1000 / config['memory_per_cpu_mb']
-        ))
+        # HPC mode: Distribute total cores evenly across workers
+        # (Don't use memory_per_cpu_mb to calculate threads - it's unreliable)
+        threads_per_worker = max(1, config['total_cores'] // n_workers)
     else:
         # Laptop mode: Calculate based on total available resources
         # How many workers can we fit given memory constraints?
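
With the fixup, HPC mode no longer derives threads_per_worker from the SLURM memory-per-CPU quota; it splits the node's cores evenly across the workers chosen by the batching logic. A minimal sketch of that rule (the worker counts below are illustrative assumptions):

    def threads_for_worker(total_cores, n_workers):
        # Distribute cores evenly; every worker gets at least one thread.
        return max(1, total_cores // n_workers)

    # With the updated dkrz_hpc profile (8 cores):
    print(threads_for_worker(8, 10))  # 8 // 10 = 0 -> clamped to 1
    print(threads_for_worker(8, 3))   # 8 // 3  = 2 threads per worker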
