Skip to content

Commit 9371a6a

Browse files
committed
Merge pull request #1673 from hjelmn/fix_rcache_deadlock
rcache: fix deadlock in multi-threaded environments
2 parents f693886 + ab8ed17 commit 9371a6a

File tree

7 files changed

+345
-202
lines changed

7 files changed

+345
-202
lines changed

opal/mca/btl/vader/btl_vader_module.c

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,17 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
539539
ep->fifo = NULL;
540540
}
541541

542+
#if OPAL_BTL_VADER_HAVE_XPMEM
543+
static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
544+
{
545+
mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
546+
/* otherwise dereg will fail on assert */
547+
reg->ref_count = 0;
548+
(void) mca_rcache_base_vma_delete (vma_module, reg);
549+
return OPAL_SUCCESS;
550+
}
551+
#endif
552+
542553
static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
543554
{
544555
OBJ_DESTRUCT(&ep->pending_frags);
@@ -548,21 +559,11 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
548559
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
549560
if (ep->segment_data.xpmem.vma_module) {
550561
/* clean out the registration cache */
551-
const int nregs = 100;
552-
mca_rcache_base_registration_t *regs[nregs];
553-
int reg_cnt;
554-
555-
do {
556-
reg_cnt = mca_rcache_base_vma_find_all (ep->segment_data.xpmem.vma_module,
557-
0, (size_t) -1, regs, nregs);
558-
for (int i = 0 ; i < reg_cnt ; ++i) {
559-
/* otherwise dereg will fail on assert */
560-
regs[i]->ref_count = 0;
561-
OBJ_RELEASE(regs[i]);
562-
}
563-
} while (reg_cnt == nregs);
564-
565-
ep->segment_data.xpmem.vma_module = NULL;
562+
(void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
563+
NULL, (size_t) -1,
564+
mca_btl_vader_endpoint_rcache_cleanup,
565+
(void *) ep->segment_data.xpmem.vma_module);
566+
OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
566567
}
567568

568569
if (ep->segment_base) {

opal/mca/rcache/base/rcache_base_vma.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2009 IBM Corporation. All rights reserved.
1616
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
17-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
17+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1818
* reserved.
1919
*
2020
* $COPYRIGHT$
@@ -144,6 +144,14 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
144144
return mca_rcache_base_vma_tree_delete (vma_module, reg);
145145
}
146146

147+
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
148+
unsigned char *base, size_t size,
149+
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
150+
void *ctx)
151+
{
152+
return mca_rcache_base_vma_tree_iterate (vma_module, base, size, callback_fn, ctx);
153+
}
154+
147155
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
148156
unsigned char *base, size_t size, char *msg)
149157
{

opal/mca/rcache/base/rcache_base_vma.h

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
*
1414
* Copyright (c) 2006 Voltaire. All rights reserved.
1515
* Copyright (c) 2009 IBM Corporation. All rights reserved.
16-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
16+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1717
* reserved.
1818
*
1919
* $COPYRIGHT$
@@ -34,6 +34,7 @@
3434
#include "opal_config.h"
3535
#include "opal/class/opal_list.h"
3636
#include "opal/class/opal_rb_tree.h"
37+
#include "opal/class/opal_lifo.h"
3738

3839
BEGIN_C_DECLS
3940

@@ -69,6 +70,26 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
6970
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
7071
unsigned char *base, size_t size, char *msg);
7172

73+
/**
74+
* Iterate over registrations in the specified range.
75+
*
76+
* @param[in] vma_module vma tree
77+
* @param[in] base base address of region
78+
* @param[in] size size of region
79+
* @param[in] callback_fn function to call for each matching registration handle
80+
* @param[in] ctx callback context
81+
*
82+
* The callback will be made with the vma lock held. This is a recursive lock so
83+
* it is still safe to call any vma functions on this vma_module. However, the
84+
* only registration the callback may pass to mca_rcache_base_vma_delete() is the
85+
* one it was supplied. The iteration will terminate if the callback returns anything
86+
* other than OPAL_SUCCESS.
87+
*/
88+
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
89+
unsigned char *base, size_t size,
90+
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
91+
void *ctx);
92+
7293
END_C_DECLS
7394

7495
#endif /* MCA_RCACHE_BASE_VMA_H */

opal/mca/rcache/base/rcache_base_vma_tree.c

Lines changed: 101 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,12 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
258258
mca_rcache_base_vma_item_t *vma;
259259
mca_rcache_base_vma_reg_list_item_t *item;
260260

261+
opal_mutex_lock (&vma_module->vma_lock);
262+
261263
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
262264
mca_rcache_base_vma_tree_node_compare_search);
263265
if (!vma) {
266+
opal_mutex_unlock (&vma_module->vma_lock);
264267
return NULL;
265268
}
266269

@@ -269,12 +272,18 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
269272
continue;
270273
}
271274

272-
if(item->reg->bound >= bound)
275+
if(item->reg->bound >= bound) {
276+
opal_mutex_unlock (&vma_module->vma_lock);
273277
return item->reg;
274-
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST))
278+
}
279+
280+
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
275281
break;
282+
}
276283
}
277284

285+
opal_mutex_unlock (&vma_module->vma_lock);
286+
278287
return NULL;
279288
}
280289

@@ -299,6 +308,8 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
299308
if(opal_list_get_size(&vma_module->vma_list) == 0)
300309
return cnt;
301310

311+
opal_mutex_lock (&vma_module->vma_lock);
312+
302313
do {
303314
mca_rcache_base_vma_item_t *vma;
304315
mca_rcache_base_vma_reg_list_item_t *vma_item;
@@ -316,25 +327,23 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
316327
}
317328

318329
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
319-
if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
330+
if (vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID ||
320331
is_reg_in_array (regs, cnt, vma_item->reg)) {
321332
continue;
322333
}
323334
regs[cnt++] = vma_item->reg;
324335
if (cnt == reg_cnt) {
336+
opal_mutex_unlock (&vma_module->vma_lock);
325337
return cnt; /* no space left in the provided array */
326338
}
327339
}
328340

329341
base = (unsigned char *)vma->end + 1;
330-
} while(bound >= base);
342+
} while (bound >= base);
331343

332-
return cnt;
333-
}
344+
opal_mutex_unlock (&vma_module->vma_lock);
334345

335-
static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
336-
{
337-
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
346+
return cnt;
338347
}
339348

340349
static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module,
@@ -343,12 +352,74 @@ static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_mo
343352
vma_module->reg_cur_cache_size += nbytes;
344353
}
345354

355+
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, unsigned char *base,
356+
size_t size, int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
357+
void *ctx)
358+
{
359+
unsigned char *bound = base + size - 1;
360+
mca_rcache_base_vma_item_t *vma;
361+
int rc = OPAL_SUCCESS;
362+
363+
if (opal_list_get_size(&vma_module->vma_list) == 0) {
364+
/* nothin to do */
365+
return OPAL_SUCCESS;
366+
}
367+
368+
opal_mutex_lock (&vma_module->vma_lock);
369+
370+
do {
371+
mca_rcache_base_vma_reg_list_item_t *vma_item, *next;
372+
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
373+
mca_rcache_base_vma_tree_node_compare_closest);
374+
375+
if (NULL == vma) {
376+
/* base is bigger than any registered memory */
377+
break;
378+
}
379+
380+
if (base < (unsigned char *) vma->start) {
381+
base = (unsigned char *) vma->start;
382+
continue;
383+
}
384+
385+
base = (unsigned char *)vma->end + 1;
386+
387+
/* all the registrations in the vma may be deleted by the callback so keep a
388+
* reference until we are done with it. */
389+
OBJ_RETAIN(vma);
390+
391+
OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
392+
rc = callback_fn (vma_item->reg, ctx);
393+
if (OPAL_SUCCESS != rc) {
394+
break;
395+
}
396+
}
397+
398+
OBJ_RELEASE(vma);
399+
400+
if (OPAL_SUCCESS != rc) {
401+
break;
402+
}
403+
} while (bound >= base);
404+
405+
opal_mutex_unlock (&vma_module->vma_lock);
406+
407+
return rc;
408+
}
409+
410+
static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
411+
{
412+
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
413+
}
414+
346415
int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
347416
mca_rcache_base_registration_t *reg, size_t limit)
348417
{
349418
mca_rcache_base_vma_item_t *i;
350419
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;
351420

421+
opal_mutex_lock (&vma_module->vma_lock);
422+
352423
i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree,
353424
(void *) begin, mca_rcache_base_vma_tree_node_compare_closest);
354425

@@ -373,6 +444,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
373444
opal_list_append(&vma_module->vma_list, &vma->super);
374445
begin = vma->end + 1;
375446
mca_rcache_base_vma_add_reg (vma, reg);
447+
opal_mutex_unlock (&vma_module->vma_lock);
376448
return OPAL_SUCCESS;
377449
}
378450

@@ -434,10 +506,14 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
434506
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
435507
}
436508

509+
opal_mutex_unlock (&vma_module->vma_lock);
510+
437511
return OPAL_SUCCESS;
438512

439513
remove:
440514
mca_rcache_base_vma_tree_delete (vma_module, reg);
515+
opal_mutex_unlock (&vma_module->vma_lock);
516+
441517
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
442518
}
443519

@@ -453,17 +529,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
453529
mca_rcache_base_registration_t *reg)
454530
{
455531
mca_rcache_base_vma_item_t *vma;
532+
opal_list_t deleted_vmas;
533+
534+
opal_mutex_lock (&vma_module->vma_lock);
456535

457536
vma = (mca_rcache_base_vma_item_t *)
458537
opal_rb_tree_find_with (&vma_module->rb_tree, reg->base,
459538
mca_rcache_base_vma_tree_node_compare_search);
460539

461540
if (!vma) {
541+
opal_mutex_unlock (&vma_module->vma_lock);
462542
return OPAL_ERROR;
463543
}
464544

545+
OBJ_CONSTRUCT(&deleted_vmas, opal_list_t);
546+
465547
while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list)
466-
&& vma->start <= (uintptr_t) reg->bound) {
548+
&& vma->start <= (uintptr_t) reg->bound) {
467549
mca_rcache_base_vma_remove_reg(vma, reg);
468550

469551
if(opal_list_is_empty(&vma->reg_list)) {
@@ -473,7 +555,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
473555
mca_rcache_base_vma_update_byte_count (vma_module,
474556
vma->start - vma->end - 1);
475557
opal_list_remove_item (&vma_module->vma_list, &vma->super);
476-
OBJ_RELEASE(vma);
558+
opal_list_append (&deleted_vmas, &vma->super);
477559
vma = next;
478560
} else {
479561
int merged;
@@ -491,7 +573,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
491573
prev->end = vma->end;
492574
opal_list_remove_item(&vma_module->vma_list, &vma->super);
493575
opal_rb_tree_delete(&vma_module->rb_tree, vma);
494-
OBJ_RELEASE(vma);
576+
opal_list_append (&deleted_vmas, &vma->super);
495577
vma = prev;
496578
merged = 1;
497579
}
@@ -505,7 +587,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
505587
vma->end = next->end;
506588
opal_list_remove_item(&vma_module->vma_list, &next->super);
507589
opal_rb_tree_delete(&vma_module->rb_tree, next);
508-
OBJ_RELEASE(next);
590+
opal_list_append (&deleted_vmas, &next->super);
509591
merged = 1;
510592
}
511593
} while (merged);
@@ -514,6 +596,11 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
514596
}
515597
}
516598

599+
opal_mutex_unlock (&vma_module->vma_lock);
600+
601+
/* actually free vmas now that the lock has been dropped */
602+
OPAL_LIST_DESTRUCT(&deleted_vmas);
603+
517604
return 0;
518605
}
519606

@@ -558,7 +645,7 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
558645
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
559646
reg = vma_item->reg;
560647
opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x",
561-
reg->base, reg->bound, reg->ref_count, reg->flags);
648+
(void *) reg->base, (void *) reg->bound, reg->ref_count, reg->flags);
562649
}
563650
base = (unsigned char *)vma->end + 1;
564651
} while (bound >= base);

opal/mca/rcache/base/rcache_base_vma_tree.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2009 IBM Corporation. All rights reserved.
1616
*
1717
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
18-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
18+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
* $COPYRIGHT$
2121
*
@@ -106,4 +106,12 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
106106
unsigned char *base, size_t size, char *msg);
107107

108108

109+
/*
110+
* Iterate over matching registration handles in the tree.
111+
*/
112+
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
113+
unsigned char *base, size_t size,
114+
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
115+
void *ctx);
116+
109117
#endif /* MCA_RCACHE_BASE_VMA_TREE_H */

opal/mca/rcache/grdma/rcache_grdma.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ struct mca_rcache_grdma_cache_t {
4040
opal_list_item_t super;
4141
char *cache_name;
4242
opal_list_t lru_list;
43-
opal_list_t gc_list;
43+
opal_lifo_t gc_lifo;
4444
mca_rcache_base_vma_module_t *vma_module;
4545
};
4646
typedef struct mca_rcache_grdma_cache_t mca_rcache_grdma_cache_t;

0 commit comments

Comments
 (0)