Commit 48735e1

Max 500 simulations simultaneously on HPC
1 parent f1e4314 commit 48735e1

File tree (2 files changed, +26 −17 lines)

ensemble/ensemble.py
simulator/opm.py

ensemble/ensemble.py

Lines changed: 23 additions & 14 deletions
@@ -569,21 +569,30 @@ def calc_prediction(self, input_state=None, save_prediction=None):
             en_pred = [self.sim.run_fwd_sim(state, member_index) for state, member_index in
                        tqdm(zip(list_state, list_member_index), total=len(list_state))]
         elif self.sim.input_dict.get('hpc', False):  # Run prediction in parallel on hpc
-            _ = [self.sim.run_fwd_sim(state, member_index, nosim=True) for state, member_index in
-                 zip(list_state, list_member_index)]
-            # Run call_sim on the hpc
-            job_id = self.sim.SLURM_HPC_run(self.ne, filename=self.sim.input_dict['runfile'])
-            # Wait for the simulations to finish
-            if job_id:
-                self.sim.wait_for_jobs(job_id)
-            else:
-                print("Job submission failed. Exiting.")
-            # Extract the results
+            batch_size = 500  # if more than 500 ensemble members, we limit the runs to batches of 500
+            # Split the ensemble into batches of at most batch_size members
             en_pred = []
-            for member_i in list_member_index:
-                self.sim.extract_data(member_i)
-                en_pred.append(deepcopy(self.sim.pred_data))
-                self.sim.remove_folder(member_i)
+            batch_en = [np.arange(start, start + batch_size) for start in
+                        np.arange(0, self.ne - batch_size, batch_size)]
+            if len(batch_en):  # self.ne is larger than batch_size
+                batch_en.append(np.arange(batch_en[-1][-1] + 1, self.ne))
+            else:  # self.ne fits in a single batch
+                batch_en.append(np.arange(0, self.ne))
+            for n_e in batch_en:
+                _ = [self.sim.run_fwd_sim(state, member_index, nosim=True) for state, member_index in
+                     zip(list_state[n_e], list_member_index[n_e])]
+                # Run call_sim on the hpc
+                job_id = self.sim.SLURM_HPC_run(n_e, venv=sys.prefix, filename=self.sim.input_dict['runfile'])
+                # Wait for the simulations to finish
+                if job_id:
+                    self.sim.wait_for_jobs(job_id)
+                else:
+                    print("Job submission failed. Exiting.")
+                # Extract the results
+                for member_i in list_member_index[n_e]:
+                    self.sim.extract_data(member_i)
+                    en_pred.append(deepcopy(self.sim.pred_data))
+                    self.sim.remove_folder(member_i)

         else:  # Run prediction in parallel using p_map
             en_pred = p_map(self.sim.run_fwd_sim, list_state,
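
For reference, the new batch construction can be exercised in isolation. A minimal sketch, assuming only NumPy (split_batches is a hypothetical helper mirroring the np.arange logic above, not part of the commit):

    import numpy as np

    def split_batches(ne, batch_size=500):
        """Split member indices 0..ne-1 into batches of at most batch_size,
        mirroring the np.arange construction in calc_prediction."""
        batches = [np.arange(start, start + batch_size)
                   for start in np.arange(0, ne - batch_size, batch_size)]
        if len(batches):  # ne is larger than batch_size: add the remainder batch
            batches.append(np.arange(batches[-1][-1] + 1, ne))
        else:  # ne fits in a single batch
            batches.append(np.arange(0, ne))
        return batches

    # 1200 members -> index ranges 0-499, 500-999, 1000-1199
    for n_e in split_batches(1200):
        print(n_e[0], n_e[-1], len(n_e))

Each batch is a contiguous index range, which is what lets SLURM_HPC_run (below) express it as a single --array=<first>-<last> job array.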

simulator/opm.py

Lines changed: 3 additions & 3 deletions
@@ -95,7 +95,7 @@ def check_sim_end(self, finished_member=None):
         return finished_member

     @staticmethod
-    def SLURM_HPC_run(num_runs, filename=None):
+    def SLURM_HPC_run(n_e, venv, filename=None):
         """
         HPC run manager for SLURM.

@@ -106,7 +106,7 @@ def SLURM_HPC_run(num_runs, filename=None):
         slurm_script = f"""#!/bin/bash
 #SBATCH --partition=comp
 #SBATCH --job-name=EnDA
-#SBATCH --array=0-{num_runs - 1}
+#SBATCH --array={n_e[0]}-{n_e[-1]}
 #SBATCH --time=01:00:00
 #SBATCH --mem=4G
 #SBATCH --cpus-per-task=1
@@ -118,7 +118,7 @@ def SLURM_HPC_run(num_runs, filename=None):
 export LMOD_DISABLE_SAME_NAME_AUTOSWAP=no
 module load opm-simulators

-source ../../../code/venv/bin/activate
+source {venv}/bin/activate

 # Set folder based on SLURM_ARRAY_TASK_ID
 folder="En_${{SLURM_ARRAY_TASK_ID}}/"
