Skip to content

Commit b565bbe

Browse files
author
Ralph Castain
committed
Detect when the system contains a mix of big-endian (BE) and little-endian (LE) nodes, warn that OMPI does not currently support such an environment, and error out
Signed-off-by: Ralph Castain <rhc@open-mpi.org> (cherry picked from commit 2753f53)
1 parent ebf2181 commit b565bbe

File tree

4 files changed

+58
-9
lines changed

4 files changed

+58
-9
lines changed

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,7 @@ AC_CACHE_SAVE
588588
opal_show_title "Header file tests"
589589

590590
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
591-
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
591+
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
592592
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
593593
poll.h pthread.h pty.h pwd.h sched.h \
594594
strings.h stropts.h linux/ethtool.h linux/sockios.h \

opal/mca/hwloc/base/hwloc_base_util.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
#ifdef HAVE_UNISTD_H
3333
#include <unistd.h>
3434
#endif
35+
#ifdef HAVE_ENDIAN_H
36+
#include <endian.h>
37+
#endif
3538

3639
#include "opal/runtime/opal.h"
3740
#include "opal/constants.h"
@@ -2163,7 +2166,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
21632166
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21642167
{
21652168
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
2166-
char *sig=NULL, *arch=NULL;
2169+
char *sig=NULL, *arch = NULL, *endian;
21672170
hwloc_obj_t obj;
21682171
unsigned i;
21692172

@@ -2183,14 +2186,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21832186
break;
21842187
}
21852188
}
2186-
21872189
if (NULL == arch) {
2188-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
2189-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
2190-
} else {
2191-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
2192-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
2190+
arch = "unknown";
21932191
}
2192+
2193+
#ifdef __BYTE_ORDER
2194+
#if __BYTE_ORDER == __LITTLE_ENDIAN
2195+
endian = "le";
2196+
#else
2197+
endian = "be";
2198+
#endif
2199+
#else
2200+
endian = "unknown";
2201+
#endif
2202+
2203+
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
2204+
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
21942205
return sig;
21952206
}
21962207

orte/mca/plm/base/help-plm-base.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2015 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
162162
no active out-of-band transports and therefore cannot execute this call.
163163
Please check to see if you have the "oob" MCA parameter set and ensure
164164
that it is either unset or at least includes the tcp transport.
165+
#
166+
[multi-endian]
167+
Open MPI does not currently support multi-endian operations. We have
168+
detected that the following node differs in endianness:
169+
170+
171+
Nodename: %s
172+
Endian: %s
173+
Local endian: %s
174+
175+
Please correct the situation and try again.

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,12 +1055,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10551055
int i;
10561056
bool found;
10571057
orte_daemon_cmd_flag_t cmd;
1058+
char *myendian;
10581059

10591060
/* get the daemon job, if necessary */
10601061
if (NULL == jdatorted) {
10611062
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
10621063
}
10631064

1065+
/* get my endianness */
1066+
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
1067+
if (NULL == t) {
1068+
/* should never happen */
1069+
myendian = "unknown";
1070+
} else {
1071+
myendian = strrchr(t->sig, ':');
1072+
++myendian;
1073+
}
1074+
10641075
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
10651076
idx = 1;
10661077
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
@@ -1240,8 +1251,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
12401251
}
12411252
free(sig);
12421253
break;
1254+
} else {
1255+
/* check if the difference is due to the endianness */
1256+
ptr = strrchr(sig, ':');
1257+
++ptr;
1258+
if (0 != strcmp(ptr, myendian)) {
1259+
/* we don't currently handle multi-endian operations in the
1260+
* MPI support */
1261+
orte_show_help("help-plm-base", "multi-endian", true,
1262+
nodename, ptr, myendian);
1263+
orted_failed_launch = true;
1264+
if (NULL != topo) {
1265+
hwloc_topology_destroy(topo);
1266+
}
1267+
goto CLEANUP;
1268+
}
12431269
}
12441270
}
1271+
12451272
if (!found) {
12461273
/* nope - save the signature and request the complete topology from that node */
12471274
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,

0 commit comments

Comments
 (0)