Skip to content

Commit

Permalink
dlm: fix waiter recovery
Browse files Browse the repository at this point in the history
An outstanding remote operation (an lkb on the "waiter"
list) could sometimes miss being resent during recovery.
The decision was based on the lkb_nodeid field, which
could have changed during an earlier aborted recovery,
so it no longer represents the actual remote destination.
The lkb_wait_nodeid is always the actual remote node,
so it is the best value to use.

Signed-off-by: David Teigland <teigland@redhat.com>
  • Loading branch information
teigland committed Apr 26, 2012
1 parent 513ef59 commit 13ef111
Showing 1 changed file with 31 additions and 12 deletions.
43 changes: 31 additions & 12 deletions fs/dlm/lock.c
Original file line number Diff line number Diff line change
Expand Up @@ -4187,15 +4187,19 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
/* A waiting lkb needs recovery if the master node has failed, or
the master node is changing (only when no directory is used) */

static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
int dir_nodeid)
{
if (dlm_is_removed(ls, lkb->lkb_nodeid))
if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
return 1;

if (!dlm_no_directory(ls))
return 0;

if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
if (dir_nodeid == dlm_our_nodeid())
return 1;

if (dir_nodeid != lkb->lkb_wait_nodeid)
return 1;

return 0;
Expand All @@ -4212,6 +4216,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
struct dlm_lkb *lkb, *safe;
struct dlm_message *ms_stub;
int wait_type, stub_unlock_result, stub_cancel_result;
int dir_nodeid;

ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
if (!ms_stub) {
Expand All @@ -4223,13 +4228,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)

list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {

dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);

/* exclude debug messages about unlocks because there can be so
many and they aren't very interesting */

if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
log_debug(ls, "recover_waiter %x nodeid %d "
"msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
"lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
lkb->lkb_id,
lkb->lkb_remid,
lkb->lkb_wait_type,
lkb->lkb_resource->res_nodeid,
lkb->lkb_nodeid,
lkb->lkb_wait_nodeid,
dir_nodeid);
}

/* all outstanding lookups, regardless of destination will be
Expand All @@ -4240,7 +4253,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
continue;
}

if (!waiter_needs_recovery(ls, lkb))
if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
continue;

wait_type = lkb->lkb_wait_type;
Expand Down Expand Up @@ -4373,8 +4386,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
ou = is_overlap_unlock(lkb);
err = 0;

log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
"lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
"overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
dlm_dir_nodeid(r), oc, ou);

/* At this point we assume that we won't get a reply to any
previous op or overlap op on this lock. First, do a big
Expand Down Expand Up @@ -4426,9 +4442,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
}
}

if (err)
log_error(ls, "recover_waiters_post %x %d %x %d %d",
lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
if (err) {
log_error(ls, "waiter %x msg %d r_nodeid %d "
"dir_nodeid %d overlap %d %d",
lkb->lkb_id, mstype, r->res_nodeid,
dlm_dir_nodeid(r), oc, ou);
}
unlock_rsb(r);
put_rsb(r);
dlm_put_lkb(lkb);
Expand Down

0 comments on commit 13ef111

Please sign in to comment.