Skip to content

Commit

Permalink
Merge pull request flux-framework#1292 from milroy/issue-1284
Browse files Browse the repository at this point in the history
Resource: support partial cancel of resources external to broker ranks
  • Loading branch information
mergify[bot] authored Nov 2, 2024
2 parents 93589f5 + 655dcc3 commit 5ac0773
Show file tree
Hide file tree
Showing 8 changed files with 2,666 additions and 22 deletions.
47 changes: 27 additions & 20 deletions qmanager/policies/base/queue_policy_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,14 +646,28 @@ class queue_policy_base_t : public resource_model::queue_adapter_base_t {
case job_state_kind_t::ALLOC_RUNNING:
// deliberately fall through
case job_state_kind_t::RUNNING:
if (cancel (h, job_it->second->id, R, true, full_removal) != 0) {
flux_log_error (flux_h,
"%s: .free RPC partial cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EINVAL;
goto out;
if (!final) {
if (cancel (h, job_it->second->id, R, true, full_removal) != 0) {
flux_log_error (flux_h,
"%s: .free RPC partial cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EINVAL;
goto out;
}
} else {
// Run a full cancel to clean up all remaining allocated resources
if (cancel (h, job_it->second->id, true) != 0) {
flux_log_error (flux_h,
"%s: .free RPC full cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EPROTO;
goto out;
}
full_removal = true;
}
// We still want to run the sched loop even if there's an inconsistent state
set_schedulability (true);
Expand All @@ -664,24 +678,17 @@ class queue_policy_base_t : public resource_model::queue_adapter_base_t {
job_it->second->state = job_state_kind_t::COMPLETE;
// hold a reference to the shared_ptr to keep it alive
// during cancel
auto job_sp = job_it->second;
m_jobs.erase (job_it);
if (final && !full_removal) {
// This error condition indicates a discrepancy between core and sched.
if (full_removal && !final) {
// This error condition can indicate a discrepancy between core and sched,
// specifically that a partial cancel removed an allocation prior to
// receiving the final .free RPC from core.
flux_log_error (flux_h,
"%s: Final .free RPC failed to remove all resources for "
"%s: removed allocation before final .free RPC for "
"jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
// Run a full cancel to clean up all remaining allocated resources
if (cancel (h, job_sp->id, true) != 0) {
flux_log_error (flux_h,
"%s: .free RPC full cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
}
errno = EPROTO;
goto out;
}
Expand Down
9 changes: 7 additions & 2 deletions resource/planner/c/planner_multi_c_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,13 @@ extern "C" int planner_multi_rem_span (planner_multi_t *ctx, int64_t span_id)
goto done;
}
for (i = 0; i < it->second.size (); ++i) {
if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1)
goto done;
// If executed after partial cancel, depending on pruning filter settings
// some spans may no longer exist. In that case the span_lookup value for
// the resource type will be -1.
if (it->second[i] != -1) {
if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1)
goto done;
}
}
ctx->plan_multi->get_span_lookup ().erase (it);
rc = 0;
Expand Down
6 changes: 6 additions & 0 deletions resource/traversers/dfu_impl_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,12 @@ int dfu_impl_t::mod_plan (vtx_t u, int64_t jobid, modify_data_t &mod_data)
span = alloc_span->second;
if (mod_data.mod_type != job_modify_t::PARTIAL_CANCEL) {
(*m_graph)[u].schedule.allocations.erase (alloc_span);
} else {
// This condition is encountered when the vertex is
// not associated with a broker rank. We may need
// extra logic here to handle more advanced partial
// cancel in the future.
goto done;
}
} else if ((res_span = (*m_graph)[u].schedule.reservations.find (jobid))
!= (*m_graph)[u].schedule.reservations.end ()) {
Expand Down
1 change: 1 addition & 0 deletions t/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set(ALL_TESTS
t1024-alloc-check.t
t1025-rv1-reload.t
t1026-rv1-partial-release.t
t1027-rv1-partial-release-brokerless-resources.t
t3000-jobspec.t
t3001-resource-basic.t
t3002-resource-prefix.t
Expand Down
Loading

0 comments on commit 5ac0773

Please sign in to comment.