Open
Description
Hi Folks,
I am running the MPI program below:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <limits.h>
#include <string.h>
/*
 * Parse a whole command-line token as an int (base auto-detected by strtol).
 * Returns 0 and stores the value in *out on success; returns -1 if the token
 * is empty, has trailing characters, or does not fit in an int.
 */
static int parse_int_arg(const char *text, int *out)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 0);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < INT_MIN || value > INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Send/receive test over a subset of MPI_COMM_WORLD.
 *
 * Usage: <prog> <size> <sender_rank> <receiver_rank_start>
 *               <receiver_rank_end> <iterations>
 *
 * One sender rank sends a page-aligned buffer of <size> bytes to every rank
 * in the inclusive range [receiver_rank_start, receiver_rank_end] using
 * blocking MPI_Send/MPI_Recv on MPI_COMM_WORLD. After each iteration the
 * sender and receivers synchronise with MPI_Barrier on a communicator built
 * from just those ranks. All other ranks only take part in setup/teardown.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE on bad arguments.
 */
int main(int argc, char** argv)
{
    enum { MAX_PARTICIPATING_RANKS = 1024, MSG_TAG = 0x1234 };

    char *data = NULL;
    void *aligned = NULL;
    int size, sender_rank, receiver_rank_start, receiver_rank_end, world_rank, world_size;
    int iterations, i, rank, participating_ranks[MAX_PARTICIPATING_RANKS], participating_ranks_size;
    int is_sender, is_receiver, is_participant, alloc_rc;
    long page_size;
    MPI_Group world_group, participating_ranks_group;
    MPI_Comm participating_ranks_comm;

    /* Reject malformed command lines before touching MPI at all. */
    if (argc < 6 ||
        parse_int_arg(argv[1], &size) != 0 ||
        parse_int_arg(argv[2], &sender_rank) != 0 ||
        parse_int_arg(argv[3], &receiver_rank_start) != 0 ||
        parse_int_arg(argv[4], &receiver_rank_end) != 0 ||
        parse_int_arg(argv[5], &iterations) != 0) {
        fprintf(stderr,
                "usage: %s <size> <sender_rank> <receiver_rank_start> "
                "<receiver_rank_end> <iterations>\n",
                argc > 0 ? argv[0] : "send_recv_group");
        return EXIT_FAILURE;
    }

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    /*
     * Every rank sees the same arguments and world size, so every rank takes
     * the same decision here and a plain finalize-and-exit cannot strand
     * anyone. The checks guard the fixed-size rank array, keep all ranks
     * inside MPI_COMM_WORLD, and forbid the sender from also being a receiver
     * (that would put a duplicate rank in the group and make the sender issue
     * a blocking send to itself).
     */
    if (size < 1 || iterations < 0 ||
        sender_rank < 0 || sender_rank >= world_size ||
        receiver_rank_start < 0 || receiver_rank_end >= world_size ||
        receiver_rank_start > receiver_rank_end ||
        (sender_rank >= receiver_rank_start && sender_rank <= receiver_rank_end) ||
        (receiver_rank_end - receiver_rank_start) + 2 > MAX_PARTICIPATING_RANKS) {
        if (world_rank == 0) {
            fprintf(stderr,
                    "invalid arguments: need size >= 1, iterations >= 0, all ranks in "
                    "[0, %d), receiver_rank_start <= receiver_rank_end, sender outside "
                    "the receiver range, and at most %d participating ranks\n",
                    world_size, MAX_PARTICIPATING_RANKS);
        }
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    MPI_Comm_group(MPI_COMM_WORLD, &world_group);

    /* Group layout: sender first, then the receivers in ascending rank order. */
    participating_ranks[0] = sender_rank;
    for (i = 1, rank = receiver_rank_start; rank <= receiver_rank_end; rank++, i++) {
        participating_ranks[i] = rank;
    }
    participating_ranks_size = (receiver_rank_end - receiver_rank_start) + 2;

    MPI_Group_incl(world_group, participating_ranks_size, participating_ranks, &participating_ranks_group);
    MPI_Comm_create_group(MPI_COMM_WORLD, participating_ranks_group, 0, &participating_ranks_comm);

    /* Page-aligned message buffer; a failure here may be local to one rank, so abort the job. */
    page_size = sysconf(_SC_PAGESIZE);
    if (page_size <= 0) {
        fprintf(stderr, "rank %d: sysconf(_SC_PAGESIZE) failed\n", world_rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    alloc_rc = posix_memalign(&aligned, (size_t)page_size, (size_t)size);
    if (alloc_rc != 0) {
        fprintf(stderr, "rank %d: posix_memalign(%d bytes) failed: %s\n",
                world_rank, size, strerror(alloc_rc));
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    data = aligned;

    is_sender = (world_rank == sender_rank);
    is_receiver = (world_rank >= receiver_rank_start && world_rank <= receiver_rank_end);
    is_participant = is_sender || is_receiver;

    if (is_participant) {
        for (i = 0; i < iterations; i++) {
            if (is_sender) {
                /* Fill with a per-iteration byte pattern, then send to each receiver in turn. */
                memset(data, i, size);
                for (rank = receiver_rank_start; rank <= receiver_rank_end; rank++) {
                    MPI_Send(data, size, MPI_CHAR, rank, MSG_TAG, MPI_COMM_WORLD);
                }
            } else {
                MPI_Recv(data, size, MPI_CHAR, sender_rank, MSG_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            /* Only the sender and receivers synchronise between iterations. */
            MPI_Barrier(participating_ranks_comm);
        }
    }

    free(data);
    MPI_Group_free(&world_group);
    MPI_Group_free(&participating_ranks_group);
    if (is_participant) {
        MPI_Comm_free(&participating_ranks_comm);
    }
    MPI_Finalize();
    return EXIT_SUCCESS;
}
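It fails intermittently: most runs complete, but occasionally rank 16 (the sender) dies with a bus error inside MPI_Finalize, as in the last run below.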
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4034347] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4034741] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4035138] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4035531] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4035923] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4036315] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4036708] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[arunchan@Milan039 ob1_xpmem]$ mpirun -np 128 --map-by core --bind-to core ./send_recv_group 8 16 48 55 200000
[Milan039:4037115] *** Process received signal ***
[Milan039:4037115] Signal: Bus error (7)
[Milan039:4037115] Signal code: Non-existant physical address (2)
[Milan039:4037115] Failing at address: 0x15550a3267c4
[Milan039:4037115] [ 0] /lib64/libpthread.so.0(+0x12ce0)[0x155554dcece0]
[Milan039:4037115] [ 1] /home/arunchan/openmpi_work/install/ompi_4_1_4_xpmem/lib/openmpi/mca_btl_vader.so(+0x5a58)[0x155548b5ba58]
[Milan039:4037115] [ 2] /home/arunchan/openmpi_work/install/ompi_4_1_4_xpmem/lib/libopen-pal.so.40(opal_progress+0x33)[0x1555544a62c3]
[Milan039:4037115] [ 3] /home/arunchan/openmpi_work/install/ompi_4_1_4_xpmem/lib/libmpi.so.40(ompi_mpi_finalize+0x1a5)[0x1555550307c5]
[Milan039:4037115] [ 4] ./send_recv_group[0x400ecd]
[Milan039:4037115] [ 5] /lib64/libc.so.6(__libc_start_main+0xf3)[0x155554a31cf3]
[Milan039:4037115] [ 6] ./send_recv_group[0x400b1e]
[Milan039:4037115] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 16 with PID 0 on node Milan039 exited on signal 7 (Bus error).
--------------------------------------------------------------------------
[Milan039:4037100] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 237
[Milan039:4037100] PMIX ERROR: NO-PERMISSIONS in file dstore_base.c at line 246
[arunchan@Milan039 ob1_xpmem]$
The same program runs perfectly fine if I build Open MPI without XPMEM.
How can I solve this problem? (I want XPMEM support so that I can test the performance of ob1.)
The ompi_info output and the topology are attached.
--Arun
topology_ompi_info.txt