Closed
Description
This issue tracks a discussion on the users mailing list:
https://www.mail-archive.com/users@lists.open-mpi.org//msg33397.html
The test case works with the ob1 PML, fails with a PSM2 error when using the PSM2 MTL, and fails silently when using the OFI MTL (most likely with the PSM2 provider).
Test case:
#define _GNU_SOURCE
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
/* Byte offset, within the gathered buffer, of the first zero byte found by
 * the post-Allgather check; stays 0 until a mismatch is recorded. */
long failed_offset = 0;

/* Number of chunks each rank contributes (default 2^16 + 1); may be
 * overridden from the command line. */
size_t nchunks = 0x10001;

/* Size in bytes of one chunk (default 2^16); may be overridden from the
 * command line. */
size_t chunk_size = 0x10000;
int main(int argc, char * argv[])
{
if (argc >= 2) chunk_size = atol(argv[1]);
if (argc >= 3) nchunks = atol(argv[1]);
MPI_Init(&argc, &argv);
/*
* This function returns:
* 0 on success.
* a non-zero MPI Error code if MPI_Allgather returned one.
* -1 if no MPI Error code was returned, but the result of Allgather
* was wrong.
* -2 if memory allocation failed.
*
* (note that the MPI document guarantees that MPI error codes are
* positive integers)
*/
int size, rank;
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int err;
char * check_text;
int rc = asprintf(&check_text, "MPI_Allgather, %d nodes, 0x%zx chunks of 0x%zx bytes, total %d * 0x%zx bytes", size, nchunks, chunk_size, size, chunk_size * nchunks);
if (rc < 0) abort();
if (!rank) printf("%s: ...\n", check_text);
MPI_Datatype mpi_ft;
MPI_Type_contiguous(chunk_size, MPI_BYTE, &mpi_ft);
MPI_Type_commit(&mpi_ft);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
void * data = malloc(nchunks * size * chunk_size);
memset(data, 0, nchunks * size * chunk_size);
int alloc_ok = data != NULL;
MPI_Allreduce(MPI_IN_PLACE, &alloc_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
if (alloc_ok) {
memset(((char*)data) + nchunks * chunk_size * rank, 0x42, nchunks * chunk_size);
err = MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
data, nchunks,
mpi_ft, MPI_COMM_WORLD);
if (err == 0) {
void * p = memchr(data, 0, nchunks * size * chunk_size);
if (p != NULL) {
/* We found a zero, we shouldn't ! */
err = -1;
failed_offset = ((char*)p)-(char*)data;
}
}
} else {
err = -2;
}
if (data) free(data);
MPI_Type_free(&mpi_ft);
if (!rank) {
printf("%s: %s\n", check_text, err == 0 ? "ok" : "NOK");
}
if (err == -2) {
puts("Could not allocate memory buffer");
} else if (err != 0) {
int someone_has_minusone = (err == -1);
MPI_Allreduce(MPI_IN_PLACE, &someone_has_minusone, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
if (someone_has_minusone) {
long * offsets = malloc(size * sizeof(long));
offsets[rank] = failed_offset;
MPI_Gather(&failed_offset, 1, MPI_LONG,
offsets, 1, MPI_LONG, 0, MPI_COMM_WORLD);
if (!rank) {
for(int i = 0 ; i < size ; i++) {
printf("node %d failed_offset = 0x%lx\n", i, offsets[i]);
}
}
free(offsets);
}
if (!rank) {
if (err > 0) { /* return an MPI Error if we've got one. */
/* we often get MPI_ERR_OTHER... mostly useless */
char error[1024];
int errorlen = sizeof(error);
MPI_Error_string(err, error, &errorlen);
printf("MPI error returned:\n%s\n", error);
}
}
}
free(check_text);
MPI_Finalize();
}