From 75d91989923c4a5d4c706f82e79d66e3bf330d2c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 10 Feb 2021 17:09:06 +0200 Subject: [PATCH] Detect and clear out-of-memory jobs --- models/Software.php | 15 +++++++++------ scheduler_files/jobMonitor.py | 22 ++++++++++++++++++---- scheduler_files/tesJobMonitor.py | 19 ++++++++++++++++--- web/js/software/logs.js | 2 +- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/models/Software.php b/models/Software.php index 3cecdad..2730437 100644 --- a/models/Software.php +++ b/models/Software.php @@ -904,6 +904,7 @@ public function createAndRunJob($commands, $fields, // print_r($statsCommand); // exit(0); shell_exec(sprintf('%s > /dev/null 2>&1 &', $statsCommand)); + // shell_exec(sprintf('%s > /data/zagganas/testlogs.txt 2>&1 &', $statsCommand)); /* @@ -955,19 +956,18 @@ public function createAndRunJob($commands, $fields, unset($output); - $podString=(explode('/',(explode(' ',$podString))[0]))[1]; - + // $podString=(explode('/',(explode(' ',$podString))[0]))[1]; - exec('sudo -u ' . Yii::$app->params['systemUser'] . ' kubectl get pods --no-headers 2>&1',$output,$ret); + exec('sudo -u ' . Yii::$app->params['systemUser'] . " kubectl get pods --no-headers 2>&1 | grep $jobName | tr -s ' ' ",$output,$ret); foreach ($output as $out) { // print_r($out); // print_r("
"); - if (strpos($out,$podString)!== false) + if (strpos($out,$jobName)!== false) { - $podString=preg_split('/[\s]+/', $out)[0]; + $podString=explode(' ', $out)[0]; break; } @@ -975,7 +975,10 @@ public function createAndRunJob($commands, $fields, // print_r($podString); // exit(0); - + // exec('sudo -u ' . Yii::$app->params['systemUser'] . " kubectl get pods --no-headers -l job-name=$jobName 2>&1 | tr -s ' ' ",$output,$ret); + // print_r($output); + // exit(0); + // $podString=explode(' ',$output)[0]; return [$podString,'', $machineType]; diff --git a/scheduler_files/jobMonitor.py b/scheduler_files/jobMonitor.py index bbbe4b0..6ee9c61 100755 --- a/scheduler_files/jobMonitor.py +++ b/scheduler_files/jobMonitor.py @@ -36,7 +36,7 @@ status_code=0 cpu=0 memory=0 -while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError"): +while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError") and (status!="OOMKilled"): code=0 command="kubectl top pod --no-headers " + podid @@ -79,8 +79,18 @@ status='Canceled' break #If it is running, get the status - out=out.split(' ') - status=out[2] + out=out.strip().split('\n') + if len(out)>1: + for line in out: + line=line.split(' ') + status=line[2] + print(out) + if status=='OOMKilled': + break + else: + out=out[0] + out=out.split(' ') + status=out[2] if status!='Canceled': #Get start and end times @@ -113,6 +123,10 @@ if status=='Complete': query="UPDATE run_history SET ram=" + str(memory) + ", cpu=" + str(cpu) + ", start='" + start +"', stop='" + str(end) + "', status='" + status +"' WHERE jobid='" + jobid + "'" status_code=0 + +elif status=='OOMKilled': + query="UPDATE run_history SET stop='NOW()', status='Out_of_RAM', remote_status_code=-10 WHERE jobid='" + jobid + "'" + status_code=-10 else: query="UPDATE run_history SET stop='NOW()', status='Error', remote_status_code=-9 WHERE jobid='" + jobid + "'" status_code=-2 @@ -122,7 +136,7 @@ conn.close() -if status!='Canceled': +if (status!='Canceled'): #Get logs command="kubectl get pods --no-headers -l job-name=" + jobName + " | tr -s ' '" try: diff --git a/scheduler_files/tesJobMonitor.py b/scheduler_files/tesJobMonitor.py index 1593ecb..72b6b30 100644 --- a/scheduler_files/tesJobMonitor.py +++ b/scheduler_files/tesJobMonitor.py @@ -34,7 +34,7 @@ def monitorJob(jobName,jobid): status_code=0 cpu=0 memory=0 - while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError"): + while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError") and (status!="OOMKilled"): code=0 command="kubectl top pod --no-headers " + podid @@ -77,8 +77,18 @@ def monitorJob(jobName,jobid): status='Canceled' break #If it is running, get the status - out=out.split(' ') - status=out[2] + out=out.strip().split('\n') + if len(out)>1: + for line in out: + line=line.split(' ') + status=line[2] + print(out) + if status=='OOMKilled': + break + else: + out=out[0] + out=out.split(' ') + status=out[2] if status!='Canceled': #Get start and end times @@ -109,6 +119,9 @@ def monitorJob(jobName,jobid): elif status=='Complete': query="UPDATE run_history SET ram=" + str(memory) + ", cpu=" + str(cpu) + ", start='" + start +"', stop='" + str(end) + "', status='" + status +"' WHERE jobid='" + jobid + "'" status_code=0 + elif status=='OOMKilled': + query="UPDATE run_history SET stop='NOW()', status='Out_of_RAM', remote_status_code=-11 WHERE jobid='" + jobid + "'" + status_code=-2 else: query="UPDATE run_history SET stop='NOW()', status='Error', remote_status_code=-9 WHERE jobid='" + jobid + "'" status_code=-2 diff --git a/web/js/software/logs.js b/web/js/software/logs.js index f16a519..cd64ba4 100644 --- a/web/js/software/logs.js +++ b/web/js/software/logs.js @@ -91,7 +91,7 @@ $(document).ready(function() status=$("#status-value").text(); - if ( (status == "Completed") || (status == "Error") || (status == "ImagePullBackOff") || (status == "Terminating") || (status == "Canceled") ) + if ( (status == "Completed") || (status == "Error") || (status == "ImagePullBackOff") || (status == "Terminating") || (status == "Canceled") || (status=="OOMKilled")) { clearInterval(refId); setTimeout(cleanUp(),2000);