Skip to content

Commit 5e30d0d

Browse files
committed
check remote job state and print it to console w
1 parent 689b5c9 commit 5e30d0d

File tree

1 file changed

+49
-5
lines changed

1 file changed

+49
-5
lines changed

slurm_jupyter_kernel/start_kernel.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,23 @@
77
import os;
88
import ast;
99
import subprocess;
10+
import time;
11+
from threading import Thread;
12+
from subprocess import check_output;
1013

1114
logging.basicConfig(level=logging.DEBUG);
1215

1316
# handle kernel as an object
1417
class remoteslurmkernel:
1518

19+
cmd_slurm_get_state = 'squeue -j {job_id} -ho "%T"';
20+
1621
def __init__ (self, kernel_cmd, connection_file, slurm_parameter, loginnode, proxyjump, srun_cmd, environment):
1722

1823
self.slurm_parameter = slurm_parameter;
1924
self.kernelcmd = kernel_cmd;
2025
self.slurm_session = None;
26+
self.job_id = None;
2127
self.connection_file = json.load(open(connection_file));
2228
self.loginnode = loginnode;
2329
self.proxyjump = False;
@@ -47,19 +53,29 @@ def start_slurm_kernel (self):
4753
proxyjump = '';
4854
if self.proxyjump:
4955
proxyjump = f'-J {self.proxyjump}';
50-
ssh_cmd = f'ssh -tA {proxyjump} {self.loginnode}';
56+
self.ssh_cmd = f'ssh -tA {proxyjump} {self.loginnode}';
57+
5158

52-
cmd = f'{ssh_cmd} /bin/bash --login -c "{self.srun_cmd}\ {cmd_args}\ -J\ {default_slurm_job_name}\ -vu\ bash\ -i"';
59+
cmd = f'{self.ssh_cmd} /bin/bash --login -c "{self.srun_cmd}\ {cmd_args}\ -J\ {default_slurm_job_name}\ -vu\ bash\ -i"';
5360

54-
logging.debug(f"Running slurm kernel command: {cmd}");
61+
logging.info(f"Running slurm kernel command: {cmd}");
5562

5663
self.slurm_session = pexpect.spawn(str(cmd), timeout=500);
57-
self.slurm_session.expect('Node (.*), .* tasks started');
5864

65+
# get slurm job id
66+
self.slurm_session.expect('srun: job (.*) queued and waiting for resources')
67+
self.job_id = self.slurm_session.match.groups()[0];
68+
self.job_id = self.job_id.decode('utf-8');
69+
logging.info(f'Slurm job id: {self.job_id}');
70+
71+
check_state_thread = Thread(target=self.check_slurm_job);
72+
check_state_thread.start();
73+
74+
self.slurm_session.expect('Node (.*), .* tasks started');
5975
# get execution node
6076
exec_node = self.slurm_session.match.groups()[0];
6177
self.exec_node = exec_node.decode('utf-8');
62-
logging.debug(f'Slurm execution node: {self.exec_node}');
78+
logging.info(f'Slurm execution node: {self.exec_node}');
6379

6480
if not self.slurm_session == None:
6581

@@ -106,6 +122,34 @@ def initialize_ssh_tunnels (self):
106122
else:
107123
logging.debug('self.exec_host is type NONE');
108124

125+
def check_slurm_job (self):
126+
127+
if not self.cmd_slurm_get_state is None:
128+
if self.job_id:
129+
self.cmd_slurm_get_state = self.cmd_slurm_get_state.format(job_id=self.job_id);
130+
#self.cmd_slurm_get_state = self.cmd_slurm_get_state.split(' ');
131+
132+
#self.cmd_slurm_get_state = self.cmd_slurm_get_state.replace(' ', '\ ');
133+
check_command = self.ssh_cmd + ' -T /bin/bash --login -c "' + self.cmd_slurm_get_state + '" 2> /dev/null';
134+
check_command = check_command.split(' ');
135+
136+
while True:
137+
time.sleep(2);
138+
state = check_output(check_command);
139+
state = state.decode('utf-8');
140+
if 'PENDING' in state:
141+
logging.info('Jupyter Slurm job is in state PENDING');
142+
continue;
143+
elif 'RUNNING' in state:
144+
logging.info('Jupyter Slurm job is now in state RUNNING! Waiting for kernel to start...');
145+
break;
146+
else:
147+
logging.error("Jupyter Slurm job is neither PENDING nor RUNNING?");
148+
continue;
149+
150+
else:
151+
raise NotImplementedError('Specify remoteslurmkernel.cmd_slurm_get_state to fetch slurm job state!');
152+
109153
def kernel_state (self):
110154
while True:
111155
if not self.slurm_session.isalive():

0 commit comments

Comments
 (0)