Skip to content

Commit dc0f4ad

Browse files
authored
Merge pull request #9622 from jjhursey/v41-libnbc-fix-overflow
v4.1.x: libnbc: Fix int overflow when handling the count parameter
2 parents e87540f + a7eb602 commit dc0f4ad

File tree

5 files changed

+67
-28
lines changed

5 files changed

+67
-28
lines changed

ompi/mca/coll/libnbc/nbc.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
1717
*
1818
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
19-
* Copyright (c) 2016 IBM Corporation. All rights reserved.
19+
* Copyright (c) 2016-2021 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All
2121
* rights reserved.
2222
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
@@ -117,7 +117,7 @@ static int nbc_schedule_round_append (NBC_Schedule *schedule, void *data, int da
117117
}
118118

119119
/* this function puts a send into the schedule */
120-
static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) {
120+
static int NBC_Sched_send_internal (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) {
121121
NBC_Args_send send_args;
122122
int ret;
123123

@@ -141,16 +141,16 @@ static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI
141141
return OMPI_SUCCESS;
142142
}
143143

144-
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
144+
int NBC_Sched_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
145145
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, false, schedule, barrier);
146146
}
147147

148-
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
148+
int NBC_Sched_local_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
149149
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, true, schedule, barrier);
150150
}
151151

152152
/* this function puts a receive into the schedule */
153-
static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) {
153+
static int NBC_Sched_recv_internal (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) {
154154
NBC_Args_recv recv_args;
155155
int ret;
156156

@@ -174,16 +174,16 @@ static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datat
174174
return OMPI_SUCCESS;
175175
}
176176

177-
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
177+
int NBC_Sched_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
178178
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, false, schedule, barrier);
179179
}
180180

181-
int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
181+
int NBC_Sched_local_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
182182
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, true, schedule, barrier);
183183
}
184184

185185
/* this function puts an operation into the schedule */
186-
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
186+
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, size_t count, MPI_Datatype datatype,
187187
MPI_Op op, NBC_Schedule *schedule, bool barrier) {
188188
NBC_Args_op op_args;
189189
int ret;
@@ -210,7 +210,8 @@ int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int
210210
}
211211

212212
/* this function puts a copy into the schedule */
213-
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount,
213+
int NBC_Sched_copy (void *src, char tmpsrc, size_t srccount, MPI_Datatype srctype,
214+
void *tgt, char tmptgt, size_t tgtcount,
214215
MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier) {
215216
NBC_Args_copy copy_args;
216217
int ret;
@@ -238,7 +239,7 @@ int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype,
238239
}
239240

240241
/* this function puts an unpack into the schedule */
241-
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
242+
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, size_t count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
242243
NBC_Schedule *schedule, bool barrier) {
243244
NBC_Args_unpack unpack_args;
244245
int ret;
@@ -520,6 +521,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
520521
} else {
521522
buf2=opargs.buf2;
522523
}
524+
523525
ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
524526
break;
525527
case COPY:

ompi/mca/coll/libnbc/nbc_internal.h

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
18+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -90,7 +91,7 @@ typedef enum {
9091
/* the send argument struct */
9192
typedef struct {
9293
NBC_Fn_type type;
93-
int count;
94+
size_t count;
9495
const void *buf;
9596
MPI_Datatype datatype;
9697
int dest;
@@ -101,7 +102,7 @@ typedef struct {
101102
/* the receive argument struct */
102103
typedef struct {
103104
NBC_Fn_type type;
104-
int count;
105+
size_t count;
105106
void *buf;
106107
MPI_Datatype datatype;
107108
char tmpbuf;
@@ -118,26 +119,26 @@ typedef struct {
118119
void *buf2;
119120
MPI_Op op;
120121
MPI_Datatype datatype;
121-
int count;
122+
size_t count;
122123
} NBC_Args_op;
123124

124125
/* the copy argument struct */
125126
typedef struct {
126127
NBC_Fn_type type;
127-
int srccount;
128+
size_t srccount;
128129
void *src;
129130
void *tgt;
130131
MPI_Datatype srctype;
131132
MPI_Datatype tgttype;
132-
int tgtcount;
133+
size_t tgtcount;
133134
char tmpsrc;
134135
char tmptgt;
135136
} NBC_Args_copy;
136137

137138
/* unpack operation arguments */
138139
typedef struct {
139140
NBC_Fn_type type;
140-
int count;
141+
size_t count;
141142
void *inbuf;
142143
void *outbuf;
143144
MPI_Datatype datatype;
@@ -146,15 +147,15 @@ typedef struct {
146147
} NBC_Args_unpack;
147148

148149
/* internal function prototypes */
149-
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
150-
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier);
151-
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
152-
int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
153-
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
150+
int NBC_Sched_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
151+
int NBC_Sched_local_send (const void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier);
152+
int NBC_Sched_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
153+
int NBC_Sched_local_recv (void* buf, char tmpbuf, size_t count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
154+
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, size_t count, MPI_Datatype datatype,
154155
MPI_Op op, NBC_Schedule *schedule, bool barrier);
155-
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount,
156+
int NBC_Sched_copy (void *src, char tmpsrc, size_t srccount, MPI_Datatype srctype, void *tgt, char tmptgt, size_t tgtcount,
156157
MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier);
157-
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
158+
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, size_t count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf,
158159
NBC_Schedule *schedule, bool barrier);
159160

160161
int NBC_Sched_barrier (NBC_Schedule *schedule);

ompi/mca/coll/libnbc/nbc_ireduce_scatter.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@
4545
static int nbc_reduce_scatter_init(const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
4646
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
4747
struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
48-
int peer, rank, maxr, p, res, count;
48+
int peer, rank, maxr, p, res;
49+
size_t count;
4950
MPI_Aint ext;
5051
ptrdiff_t gap, span, span_align;
5152
char *sbuf, inplace;
@@ -230,7 +231,8 @@ int ompi_coll_libnbc_ireduce_scatter (const void* sendbuf, void* recvbuf, const
230231
static int nbc_reduce_scatter_inter_init (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
231232
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
232233
struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
233-
int rank, res, count, lsize, rsize;
234+
int rank, res, lsize, rsize;
235+
size_t count;
234236
MPI_Aint ext;
235237
ptrdiff_t gap, span, span_align;
236238
NBC_Schedule *schedule;

ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
static int nbc_reduce_scatter_block_init(const void* sendbuf, void* recvbuf, int recvcount, MPI_Datatype datatype,
4444
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
4545
struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
46-
int peer, rank, maxr, p, res, count;
46+
int peer, rank, maxr, p, res;
47+
size_t count;
4748
MPI_Aint ext;
4849
ptrdiff_t gap, span;
4950
char *redbuf, *sbuf, inplace;
@@ -229,7 +230,8 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
229230
static int nbc_reduce_scatter_block_inter_init(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype,
230231
struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request,
231232
struct mca_coll_base_module_2_3_0_t *module, bool persistent) {
232-
int rank, res, count, lsize, rsize;
233+
int rank, res, lsize, rsize;
234+
size_t count;
233235
MPI_Aint ext;
234236
ptrdiff_t gap, span, span_align;
235237
NBC_Schedule *schedule;

ompi/op/op.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* reserved.
1818
* Copyright (c) 2019 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
20+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
2021
* $COPYRIGHT$
2122
*
2223
* Additional copyrights may follow
@@ -541,10 +542,41 @@ static inline bool ompi_op_is_valid(ompi_op_t * op, ompi_datatype_t * ddt,
541542
* is not defined to have that operation, it is likely to seg fault.
542543
*/
543544
static inline void ompi_op_reduce(ompi_op_t * op, void *source,
544-
void *target, int count,
545+
void *target, size_t full_count,
545546
ompi_datatype_t * dtype)
546547
{
547548
MPI_Fint f_dtype, f_count;
549+
int count = full_count;
550+
551+
/*
552+
* If the full_count is > INT_MAX then we need to call the reduction op
553+
* in iterations of counts <= INT_MAX since it has an `int *len`
554+
* parameter.
555+
*
556+
* Note: When we add BigCount support then we can distinguish between
557+
* a reduction operation with `int *len` and `MPI_Count *len`, at which
558+
* point we can avoid this loop.
559+
*/
560+
if( OPAL_UNLIKELY(full_count > INT_MAX) ) {
561+
size_t done_count = 0, shift;
562+
int iter_count;
563+
ptrdiff_t ext, lb;
564+
565+
ompi_datatype_get_extent(dtype, &lb, &ext);
566+
567+
while(done_count < full_count) {
568+
if(done_count + INT_MAX > full_count) {
569+
iter_count = full_count - done_count;
570+
} else {
571+
iter_count = INT_MAX;
572+
}
573+
shift = done_count * ext;
574+
// Recurse one level: each chunk is <= INT_MAX, so the nested call skips this branch
575+
ompi_op_reduce(op, (char*)source + shift, (char*)target + shift, iter_count, dtype);
576+
done_count += iter_count;
577+
}
578+
return;
579+
}
548580

549581
/*
550582
* Call the reduction function. Two dimensions: a) if both the op

0 commit comments

Comments
 (0)