Skip to content

Commit

Permalink
Detect and clear out-of-memory jobs
Browse files: browse the repository at this point in the history
  • Loading branch information
zagganas committed Feb 10, 2021
1 parent 9ff4630 commit 75d9198
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 14 deletions.
15 changes: 9 additions & 6 deletions models/Software.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -904,6 +904,7 @@ public function createAndRunJob($commands, $fields,
// print_r($statsCommand);
// exit(0);
shell_exec(sprintf('%s > /dev/null 2>&1 &', $statsCommand));
// shell_exec(sprintf('%s > /data/zagganas/testlogs.txt 2>&1 &', $statsCommand));


/*
Expand Down Expand Up @@ -955,27 +956,29 @@ public function createAndRunJob($commands, $fields,
unset($output);


$podString=(explode('/',(explode(' ',$podString))[0]))[1];

// $podString=(explode('/',(explode(' ',$podString))[0]))[1];

exec('sudo -u ' . Yii::$app->params['systemUser'] . ' kubectl get pods --no-headers 2>&1',$output,$ret);
exec('sudo -u ' . Yii::$app->params['systemUser'] . " kubectl get pods --no-headers 2>&1 | grep $jobName | tr -s ' ' ",$output,$ret);


foreach ($output as $out)
{
// print_r($out);
// print_r("<br />");
if (strpos($out,$podString)!== false)
if (strpos($out,$jobName)!== false)
{
$podString=preg_split('/[\s]+/', $out)[0];
$podString=explode(' ', $out)[0];
break;
}

}

// print_r($podString);
// exit(0);

// exec('sudo -u ' . Yii::$app->params['systemUser'] . " kubectl get pods --no-headers -l job-name=$jobName 2>&1 | tr -s ' ' ",$output,$ret);
// print_r($output);
// exit(0);
// $podString=explode(' ',$output)[0];

return [$podString,'', $machineType];

Expand Down
22 changes: 18 additions & 4 deletions scheduler_files/jobMonitor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,7 +36,7 @@
status_code=0
cpu=0
memory=0
while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError"):
while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError") and (status!="OOMKilled"):

code=0
command="kubectl top pod --no-headers " + podid
Expand Down Expand Up @@ -79,8 +79,18 @@
status='Canceled'
break
#If it is running, get the status
out=out.split(' ')
status=out[2]
out=out.strip().split('\n')
if len(out)>1:
for line in out:
line=line.split(' ')
status=line[2]
print(out)
if status=='OOMKilled':
break
else:
out=out[0]
out=out.split(' ')
status=out[2]

if status!='Canceled':
#Get start and end times
Expand Down Expand Up @@ -113,6 +123,10 @@
if status=='Complete':
query="UPDATE run_history SET ram=" + str(memory) + ", cpu=" + str(cpu) + ", start='" + start +"', stop='" + str(end) + "', status='" + status +"' WHERE jobid='" + jobid + "'"
status_code=0

elif status=='OOMKilled':
query="UPDATE run_history SET stop='NOW()', status='Out_of_RAM', remote_status_code=-10 WHERE jobid='" + jobid + "'"
status_code=-10
else:
query="UPDATE run_history SET stop='NOW()', status='Error', remote_status_code=-9 WHERE jobid='" + jobid + "'"
status_code=-2
Expand All @@ -122,7 +136,7 @@

conn.close()

if status!='Canceled':
if (status!='Canceled'):
#Get logs
command="kubectl get pods --no-headers -l job-name=" + jobName + " | tr -s ' '"
try:
Expand Down
19 changes: 16 additions & 3 deletions scheduler_files/tesJobMonitor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,7 +34,7 @@ def monitorJob(jobName,jobid):
status_code=0
cpu=0
memory=0
while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError"):
while (status!='Completed') and (status!='Error') and (status!='ErrImagePullBackOff') and (status!="ContainerCannotRun") and (status!="RunContainerError") and (status!="OOMKilled"):

code=0
command="kubectl top pod --no-headers " + podid
Expand Down Expand Up @@ -77,8 +77,18 @@ def monitorJob(jobName,jobid):
status='Canceled'
break
#If it is running, get the status
out=out.split(' ')
status=out[2]
out=out.strip().split('\n')
if len(out)>1:
for line in out:
line=line.split(' ')
status=line[2]
print(out)
if status=='OOMKilled':
break
else:
out=out[0]
out=out.split(' ')
status=out[2]

if status!='Canceled':
#Get start and end times
Expand Down Expand Up @@ -109,6 +119,9 @@ def monitorJob(jobName,jobid):
elif status=='Complete':
query="UPDATE run_history SET ram=" + str(memory) + ", cpu=" + str(cpu) + ", start='" + start +"', stop='" + str(end) + "', status='" + status +"' WHERE jobid='" + jobid + "'"
status_code=0
elif status=='OOMKilled':
query="UPDATE run_history SET stop='NOW()', status='Out_of_RAM', remote_status_code=-11 WHERE jobid='" + jobid + "'"
status_code=-2
else:
query="UPDATE run_history SET stop='NOW()', status='Error', remote_status_code=-9 WHERE jobid='" + jobid + "'"
status_code=-2
Expand Down
2 changes: 1 addition & 1 deletion web/js/software/logs.js
Original file line number | Diff line number | Diff line change
Expand Up @@ -91,7 +91,7 @@ $(document).ready(function()


status=$("#status-value").text();
if ( (status == "Completed") || (status == "Error") || (status == "ImagePullBackOff") || (status == "Terminating") || (status == "Canceled") )
if ( (status == "Completed") || (status == "Error") || (status == "ImagePullBackOff") || (status == "Terminating") || (status == "Canceled") || (status=="OOMKilled"))
{
clearInterval(refId);
setTimeout(cleanUp(),2000);
Expand Down

0 comments on commit 75d9198

Please sign in to comment.