Skip to content

osc/pt2pt hang in master #1299

Closed
Closed
@ggouaillardet

Description

@ggouaillardet

@hjelmn can you please have a look at this?

Here is a reproducer:

#include <mpi.h>

/*
 * Minimal reproducer: one MPI_Accumulate inside a lock_all epoch on a
 * window allocated over MPI_COMM_SELF, followed by flush/unlock/free.
 * The sequence of MPI calls is the whole point of the program.
 */
int main(int argc, char* argv[])
{
    double addend;      /* origin buffer passed to MPI_Accumulate */
    double *win_base;   /* window memory returned by MPI_Win_allocate */
    MPI_Win window;

    MPI_Init(&argc, &argv);

    addend = 0;

    /* One double of window memory, displacement unit 1, self communicator. */
    MPI_Win_allocate(sizeof(double), 1, MPI_INFO_NULL, MPI_COMM_SELF,
                     (void *)&win_base, &window);

    /* Open the lock_all epoch, then zero the window memory. */
    MPI_Win_lock_all(MPI_MODE_NOCHECK, window);
    *win_base = 0.;

    /* Sum one double from the origin buffer into rank 0 at displacement 0. */
    MPI_Accumulate(&addend, 1, MPI_DOUBLE, 0, 0, 1, MPI_DOUBLE, MPI_SUM, window);

    /* Complete outstanding operations, close the epoch, release the window. */
    MPI_Win_flush_all(window);
    MPI_Win_unlock_all(window);
    MPI_Win_free(&window);

    MPI_Finalize();

    return 0;
}

This can be run with only one MPI task.

It works fine with --mca osc sm on both v1.10 and master.
With --mca osc pt2pt, however, it works fine on v1.10 but hangs on master.

I ran this under the debugger and ended up writing this patch so that master mimics v1.10.
That being said, I have no idea whether this is correct or not ...

diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c
index 7e28914..34df3ab 100644
--- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c
+++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c
@@ -2,6 +2,8 @@
 /*
  * Copyright (c) 2015      Los Alamos National Security, LLC.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -17,6 +19,7 @@ static void ompi_osc_pt2pt_sync_constructor (ompi_osc_pt2pt_sync_t *sync)
     sync->type = OMPI_OSC_PT2PT_SYNC_TYPE_NONE;
     sync->eager_send_active = false;
     sync->epoch_active = false;
+    sync->sync.pscw.group = NULL;
     OBJ_CONSTRUCT(&sync->lock, opal_mutex_t);
     OBJ_CONSTRUCT(&sync->cond, opal_condition_t);
 }
diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.h b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.h
index eee2964..cfed4e9 100644
--- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.h
+++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.h
@@ -2,6 +2,8 @@
 /*
  * Copyright (c) 2015      Los Alamos National Security, LLC.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -45,7 +47,7 @@ struct ompi_osc_pt2pt_sync_t {
     ompi_osc_pt2pt_sync_type_t type;

     /** synchronization data */
-    union {
+    struct {
         /** lock specific synchronization data */
         struct {
             /** lock target rank (-1 for all) */
@@ -129,13 +131,15 @@ bool ompi_osc_pt2pt_sync_pscw_peer (struct ompi_osc_pt2pt_module_t *module, int
  */
 static inline void ompi_osc_pt2pt_sync_wait (ompi_osc_pt2pt_sync_t *sync)
 {
-    OPAL_THREAD_LOCK(&sync->lock);
-    while (!sync->eager_send_active) {
-        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
-                             "waiting for access epoch to start"));
-        opal_condition_wait(&sync->cond, &sync->lock);
+    if (sync->sync.pscw.group) {
+        OPAL_THREAD_LOCK(&sync->lock);
+        while (!sync->eager_send_active) {
+            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+                                 "waiting for access epoch to start"));
+            opal_condition_wait(&sync->cond, &sync->lock);
+        }
+        OPAL_THREAD_UNLOCK(&sync->lock);
     }
-    OPAL_THREAD_UNLOCK(&sync->lock);

     OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                          "access epoch ready"));

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions