|
7 | 7 | import os;
|
8 | 8 | import ast;
|
9 | 9 | import subprocess;
|
| 10 | +import time; |
| 11 | +from threading import Thread; |
| 12 | +from subprocess import check_output; |
10 | 13 |
|
11 | 14 | logging.basicConfig(level=logging.DEBUG);
|
12 | 15 |
|
13 | 16 | # handle kernel as an object
|
14 | 17 | class remoteslurmkernel:
|
15 | 18 |
|
| 19 | + cmd_slurm_get_state = 'squeue -j {job_id} -ho "%T"'; |
| 20 | + |
16 | 21 | def __init__ (self, kernel_cmd, connection_file, slurm_parameter, loginnode, proxyjump, srun_cmd, environment):
|
17 | 22 |
|
18 | 23 | self.slurm_parameter = slurm_parameter;
|
19 | 24 | self.kernelcmd = kernel_cmd;
|
20 | 25 | self.slurm_session = None;
|
| 26 | + self.job_id = None; |
21 | 27 | self.connection_file = json.load(open(connection_file));
|
22 | 28 | self.loginnode = loginnode;
|
23 | 29 | self.proxyjump = False;
|
@@ -47,19 +53,29 @@ def start_slurm_kernel (self):
|
47 | 53 | proxyjump = '';
|
48 | 54 | if self.proxyjump:
|
49 | 55 | proxyjump = f'-J {self.proxyjump}';
|
50 |
| - ssh_cmd = f'ssh -tA {proxyjump} {self.loginnode}'; |
| 56 | + self.ssh_cmd = f'ssh -tA {proxyjump} {self.loginnode}'; |
| 57 | + |
51 | 58 |
|
52 |
| - cmd = f'{ssh_cmd} /bin/bash --login -c "{self.srun_cmd}\ {cmd_args}\ -J\ {default_slurm_job_name}\ -vu\ bash\ -i"'; |
| 59 | + cmd = f'{self.ssh_cmd} /bin/bash --login -c "{self.srun_cmd}\ {cmd_args}\ -J\ {default_slurm_job_name}\ -vu\ bash\ -i"'; |
53 | 60 |
|
54 |
| - logging.debug(f"Running slurm kernel command: {cmd}"); |
| 61 | + logging.info(f"Running slurm kernel command: {cmd}"); |
55 | 62 |
|
56 | 63 | self.slurm_session = pexpect.spawn(str(cmd), timeout=500);
|
57 |
| - self.slurm_session.expect('Node (.*), .* tasks started'); |
58 | 64 |
|
| 65 | + # get slurm job id |
| 66 | + self.slurm_session.expect('srun: job (.*) queued and waiting for resources') |
| 67 | + self.job_id = self.slurm_session.match.groups()[0]; |
| 68 | + self.job_id = self.job_id.decode('utf-8'); |
| 69 | + logging.info(f'Slurm job id: {self.job_id}'); |
| 70 | + |
| 71 | + check_state_thread = Thread(target=self.check_slurm_job); |
| 72 | + check_state_thread.start(); |
| 73 | + |
| 74 | + self.slurm_session.expect('Node (.*), .* tasks started'); |
59 | 75 | # get execution node
|
60 | 76 | exec_node = self.slurm_session.match.groups()[0];
|
61 | 77 | self.exec_node = exec_node.decode('utf-8');
|
62 |
| - logging.debug(f'Slurm execution node: {self.exec_node}'); |
| 78 | + logging.info(f'Slurm execution node: {self.exec_node}'); |
63 | 79 |
|
64 | 80 | if not self.slurm_session == None:
|
65 | 81 |
|
@@ -106,6 +122,34 @@ def initialize_ssh_tunnels (self):
|
106 | 122 | else:
|
107 | 123 | logging.debug('self.exec_host is type NONE');
|
108 | 124 |
|
| 125 | + def check_slurm_job (self): |
| 126 | + |
| 127 | + if not self.cmd_slurm_get_state is None: |
| 128 | + if self.job_id: |
| 129 | + self.cmd_slurm_get_state = self.cmd_slurm_get_state.format(job_id=self.job_id); |
| 130 | + #self.cmd_slurm_get_state = self.cmd_slurm_get_state.split(' '); |
| 131 | + |
| 132 | + #self.cmd_slurm_get_state = self.cmd_slurm_get_state.replace(' ', '\ '); |
| 133 | + check_command = self.ssh_cmd + ' -T /bin/bash --login -c "' + self.cmd_slurm_get_state + '" 2> /dev/null'; |
| 134 | + check_command = check_command.split(' '); |
| 135 | + |
| 136 | + while True: |
| 137 | + time.sleep(2); |
| 138 | + state = check_output(check_command); |
| 139 | + state = state.decode('utf-8'); |
| 140 | + if 'PENDING' in state: |
| 141 | + logging.info('Jupyter Slurm job is in state PENDING'); |
| 142 | + continue; |
| 143 | + elif 'RUNNING' in state: |
| 144 | + logging.info('Jupyter Slurm job is now in state RUNNING! Waiting for kernel to start...'); |
| 145 | + break; |
| 146 | + else: |
| 147 | + logging.error("Jupyter Slurm job is neither PENDING nor RUNNING?"); |
| 148 | + continue; |
| 149 | + |
| 150 | + else: |
| 151 | + raise NotImplementedError('Specify remoteslurmkernel.cmd_slurm_get_state to fetch slurm job state!'); |
| 152 | + |
109 | 153 | def kernel_state (self):
|
110 | 154 | while True:
|
111 | 155 | if not self.slurm_session.isalive():
|
|
0 commit comments