Skip to content

rcache: fix deadlock in multi-threaded environments #1673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 18, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions opal/mca/btl/vader/btl_vader_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,17 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
ep->fifo = NULL;
}

#if OPAL_BTL_VADER_HAVE_XPMEM
static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
{
mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
/* otherwise dereg will fail on assert */
reg->ref_count = 0;
(void) mca_rcache_base_vma_delete (vma_module, reg);
return OPAL_SUCCESS;
}
#endif

static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
{
OBJ_DESTRUCT(&ep->pending_frags);
Expand All @@ -548,21 +559,11 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
if (ep->segment_data.xpmem.vma_module) {
/* clean out the registration cache */
const int nregs = 100;
mca_rcache_base_registration_t *regs[nregs];
int reg_cnt;

do {
reg_cnt = mca_rcache_base_vma_find_all (ep->segment_data.xpmem.vma_module,
0, (size_t) -1, regs, nregs);
for (int i = 0 ; i < reg_cnt ; ++i) {
/* otherwise dereg will fail on assert */
regs[i]->ref_count = 0;
OBJ_RELEASE(regs[i]);
}
} while (reg_cnt == nregs);

ep->segment_data.xpmem.vma_module = NULL;
(void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
NULL, (size_t) -1,
mca_btl_vader_endpoint_rcache_cleanup,
(void *) ep->segment_data.xpmem.vma_module);
OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
}

if (ep->segment_base) {
Expand Down
10 changes: 9 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
Expand Down Expand Up @@ -144,6 +144,14 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
return mca_rcache_base_vma_tree_delete (vma_module, reg);
}

int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
return mca_rcache_base_vma_tree_iterate (vma_module, base, size, callback_fn, ctx);
}

void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg)
{
Expand Down
23 changes: 22 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
Expand All @@ -34,6 +34,7 @@
#include "opal_config.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_rb_tree.h"
#include "opal/class/opal_lifo.h"

BEGIN_C_DECLS

Expand Down Expand Up @@ -69,6 +70,26 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg);

/**
* Iterate over registrations in the specified range.
*
* @param[in] vma_module vma tree
* @param[in] base base address of region
* @param[in] size size of region
* @param[in] callback_fn function to call for each matching registration handle
* @param[in] ctx callback context
*
* The callback will be made with the vma lock held. This is a recursive lock so
* it is still safe to call any vma functions on this vma_module. Keep in mind it
* is only safe to call mca_rcache_base_vma_delete() on the supplied registration
* from the callback. The iteration will terminate if the callback returns anything
* other than OPAL_SUCCESS.
*/
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);

END_C_DECLS

#endif /* MCA_RCACHE_BASE_VMA_H */
115 changes: 101 additions & 14 deletions opal/mca/rcache/base/rcache_base_vma_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,12 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *item;

opal_mutex_lock (&vma_module->vma_lock);

vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_search);
if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return NULL;
}

Expand All @@ -269,12 +272,18 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
continue;
}

if(item->reg->bound >= bound)
if(item->reg->bound >= bound) {
opal_mutex_unlock (&vma_module->vma_lock);
return item->reg;
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST))
}

if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
break;
}
}

opal_mutex_unlock (&vma_module->vma_lock);

return NULL;
}

Expand All @@ -299,6 +308,8 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
if(opal_list_get_size(&vma_module->vma_list) == 0)
return cnt;

opal_mutex_lock (&vma_module->vma_lock);

do {
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *vma_item;
Expand All @@ -316,25 +327,23 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
}

OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
if (vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID ||
is_reg_in_array (regs, cnt, vma_item->reg)) {
continue;
}
regs[cnt++] = vma_item->reg;
if (cnt == reg_cnt) {
opal_mutex_unlock (&vma_module->vma_lock);
return cnt; /* no space left in the provided array */
}
}

base = (unsigned char *)vma->end + 1;
} while(bound >= base);
} while (bound >= base);

return cnt;
}
opal_mutex_unlock (&vma_module->vma_lock);

static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
return cnt;
}

static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module,
Expand All @@ -343,12 +352,74 @@ static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_mo
vma_module->reg_cur_cache_size += nbytes;
}

int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, unsigned char *base,
size_t size, int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
unsigned char *bound = base + size - 1;
mca_rcache_base_vma_item_t *vma;
int rc = OPAL_SUCCESS;

if (opal_list_get_size(&vma_module->vma_list) == 0) {
/* nothin to do */
return OPAL_SUCCESS;
}

opal_mutex_lock (&vma_module->vma_lock);

do {
mca_rcache_base_vma_reg_list_item_t *vma_item, *next;
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_closest);

if (NULL == vma) {
/* base is bigger than any registered memory */
break;
}

if (base < (unsigned char *) vma->start) {
base = (unsigned char *) vma->start;
continue;
}

base = (unsigned char *)vma->end + 1;

/* all the registrations in the vma may be deleted by the callback so keep a
* reference until we are done with it. */
OBJ_RETAIN(vma);

OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
rc = callback_fn (vma_item->reg, ctx);
if (OPAL_SUCCESS != rc) {
break;
}
}

OBJ_RELEASE(vma);

if (OPAL_SUCCESS != rc) {
break;
}
} while (bound >= base);

opal_mutex_unlock (&vma_module->vma_lock);

return rc;
}

static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
}

int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg, size_t limit)
{
mca_rcache_base_vma_item_t *i;
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;

opal_mutex_lock (&vma_module->vma_lock);

i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree,
(void *) begin, mca_rcache_base_vma_tree_node_compare_closest);

Expand All @@ -373,6 +444,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
opal_list_append(&vma_module->vma_list, &vma->super);
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_SUCCESS;
}

Expand Down Expand Up @@ -434,10 +506,14 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
}

opal_mutex_unlock (&vma_module->vma_lock);

return OPAL_SUCCESS;

remove:
mca_rcache_base_vma_tree_delete (vma_module, reg);
opal_mutex_unlock (&vma_module->vma_lock);

return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
}

Expand All @@ -453,17 +529,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg)
{
mca_rcache_base_vma_item_t *vma;
opal_list_t deleted_vmas;

opal_mutex_lock (&vma_module->vma_lock);

vma = (mca_rcache_base_vma_item_t *)
opal_rb_tree_find_with (&vma_module->rb_tree, reg->base,
mca_rcache_base_vma_tree_node_compare_search);

if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_ERROR;
}

OBJ_CONSTRUCT(&deleted_vmas, opal_list_t);

while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list)
&& vma->start <= (uintptr_t) reg->bound) {
&& vma->start <= (uintptr_t) reg->bound) {
mca_rcache_base_vma_remove_reg(vma, reg);

if(opal_list_is_empty(&vma->reg_list)) {
Expand All @@ -473,7 +555,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_vma_update_byte_count (vma_module,
vma->start - vma->end - 1);
opal_list_remove_item (&vma_module->vma_list, &vma->super);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = next;
} else {
int merged;
Expand All @@ -491,7 +573,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
prev->end = vma->end;
opal_list_remove_item(&vma_module->vma_list, &vma->super);
opal_rb_tree_delete(&vma_module->rb_tree, vma);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = prev;
merged = 1;
}
Expand All @@ -505,7 +587,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
vma->end = next->end;
opal_list_remove_item(&vma_module->vma_list, &next->super);
opal_rb_tree_delete(&vma_module->rb_tree, next);
OBJ_RELEASE(next);
opal_list_append (&deleted_vmas, &next->super);
merged = 1;
}
} while (merged);
Expand All @@ -514,6 +596,11 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
}
}

opal_mutex_unlock (&vma_module->vma_lock);

/* actually free vmas now that the lock has been dropped */
OPAL_LIST_DESTRUCT(&deleted_vmas);

return 0;
}

Expand Down Expand Up @@ -558,7 +645,7 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
reg = vma_item->reg;
opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x",
reg->base, reg->bound, reg->ref_count, reg->flags);
(void *) reg->base, (void *) reg->bound, reg->ref_count, reg->flags);
}
base = (unsigned char *)vma->end + 1;
} while (bound >= base);
Expand Down
10 changes: 9 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2009 IBM Corporation. All rights reserved.
*
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -106,4 +106,12 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
unsigned char *base, size_t size, char *msg);


/*
* Iterate over matching registration handles in the tree.
*/
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);

#endif /* MCA_RCACHE_BASE_VMA_TREE_H */
2 changes: 1 addition & 1 deletion opal/mca/rcache/grdma/rcache_grdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ struct mca_rcache_grdma_cache_t {
opal_list_item_t super;
char *cache_name;
opal_list_t lru_list;
opal_list_t gc_list;
opal_lifo_t gc_lifo;
mca_rcache_base_vma_module_t *vma_module;
};
typedef struct mca_rcache_grdma_cache_t mca_rcache_grdma_cache_t;
Expand Down
Loading