Skip to content

Commit 6cc8f97

Browse files
committed
Keep track of failed nodes when calculating block size and add some debugging statements for the PBS provider
1 parent abc7ce6 commit 6cc8f97

File tree

3 files changed

+24
-2
lines changed

3 files changed

+24
-2
lines changed

cogkit/modules/provider-coaster/src/org/globus/cog/abstraction/coaster/service/job/manager/Block.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ private synchronized static BlockTaskSubmitter getSubmitter() {
6565
return submitter;
6666
}
6767

68-
private int workers, activeWorkers;
68+
private int workers, activeWorkers, failedWorkers;
6969
private TimeInterval walltime, maxIdleTime;
7070
private Time endTime, startTime, deadline, creationTime, terminationTime, shutdownTime;
7171
private final SortedMap<Cpu, Time> scpus;
@@ -267,7 +267,7 @@ public double sizeLeft() {
267267
}
268268
else if (running) {
269269
return bqp.getMetric().size(
270-
workers,
270+
workers - failedWorkers,
271271
(int) TimeInterval.max(getEndTime().subtract(Time.max(Time.now(), startTime)), NO_TIME).getSeconds());
272272
}
273273
else {
@@ -661,6 +661,8 @@ public void removeNode(Node node) {
661661
int left;
662662
synchronized(cpus) {
663663
nodes.remove(node);
664+
this.activeWorkers -= node.getConcurrency();
665+
this.failedWorkers += node.getConcurrency();
664666
for (Cpu cpu : node.getCpus()) {
665667
scpus.remove(cpu);
666668
cpus.remove(cpu);

cogkit/modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/common/AbstractJobSubmissionTaskHandler.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ public synchronized void cancel(String message)
104104
}
105105

106106
public void processCompleted(int exitCode) {
107+
if (logger.isDebugEnabled()) {
108+
logger.debug("Process " + getTask().getIdentity() + " completed. Exit code: " + exitCode);
109+
}
107110
if (getTask().getStatus().getStatusCode() != Status.FAILED) {
108111
if (exitCode == 0) {
109112
getTask().setStatus(Status.COMPLETED);
@@ -115,14 +118,23 @@ public void processCompleted(int exitCode) {
115118
}
116119

117120
public void processFailed(String message) {
121+
if (logger.isDebugEnabled()) {
122+
logger.debug("Process " + getTask().getIdentity() + " failed: " + message);
123+
}
118124
failTask(message, null);
119125
}
120126

121127
public void processFailed(Exception e) {
128+
if (logger.isDebugEnabled()) {
129+
logger.debug("Process " + getTask().getIdentity() + " failed: " + e);
130+
}
122131
failTask(null, e);
123132
}
124133

125134
public void statusChanged(int status) {
135+
if (logger.isDebugEnabled()) {
136+
logger.debug("Process " + getTask().getIdentity() + " status changed: " + status);
137+
}
126138
if (status == Job.STATE_RUNNING) {
127139
Job job = executor.getJob();
128140
String location = null;

cogkit/modules/provider-localscheduler/src/org/globus/cog/abstraction/impl/scheduler/common/Job.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ public boolean close(int tentativeState) {
8484
}
8585
File f = null;
8686
if (exitcodeFileName != null) {
87+
if (logger.isDebugEnabled()) {
88+
logger.debug("Reading exit code file for " + jobID);
89+
}
8790
f = new File(exitcodeFileName);
8891
if (f != null && !f.exists()) {
8992
if (ticks == 5) {
@@ -100,6 +103,11 @@ else if (exitcode != NO_EXITCODE) {
100103
f.delete();
101104
}
102105
}
106+
else {
107+
if (logger.isDebugEnabled()) {
108+
logger.debug("Job " + jobID + " has no exit code file");
109+
}
110+
}
103111

104112
processExitCode(tentativeState);
105113
return true;

0 commit comments

Comments
 (0)