Skip to content

Commit dc9768b

Browse files
authored
Merge pull request #13029 from tvegas1/oshmem_base_exchange.v4.1.x
oshmem/shmem: Allocate and exchange base segment address beforehand
2 parents 1176ac1 + f58ea7f commit dc9768b

File tree

6 files changed

+190
-28
lines changed

6 files changed

+190
-28
lines changed

oshmem/mca/memheap/base/memheap_base_select.c

Lines changed: 142 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
#include "oshmem/mca/sshmem/base/base.h"
2727
#include "ompi/util/timings.h"
2828

29+
#include <sys/mman.h>
30+
2931
mca_memheap_base_config_t mca_memheap_base_config = {
3032
.device_nic_mem_seg_size = 0
3133
};
@@ -107,6 +109,136 @@ static size_t _memheap_size(void)
107109
return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size);
108110
}
109111

112+
static void *memheap_mmap_get(void *hint, size_t size)
113+
{
114+
void *addr;
115+
116+
addr = mmap(hint, size,
117+
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
118+
if (addr == MAP_FAILED) {
119+
return NULL;
120+
}
121+
122+
return addr;
123+
}
124+
125+
static int memheap_exchange_base_address(size_t size, void **address)
126+
{
127+
int nprocs = oshmem_num_procs();
128+
int need_sync = (*address == NULL);
129+
void *base = NULL;
130+
void *ptr = NULL;
131+
int rc, i;
132+
void **bases;
133+
134+
bases = calloc(nprocs, sizeof(*bases));
135+
if (NULL == bases) {
136+
return OSHMEM_ERROR;
137+
}
138+
139+
if (oshmem_my_proc_id() == 0) {
140+
ptr = memheap_mmap_get(NULL, size);
141+
base = ptr;
142+
}
143+
144+
rc = oshmem_shmem_bcast(&base, sizeof(base), 0);
145+
if (OSHMEM_SUCCESS != rc) {
146+
MEMHEAP_ERROR("Failed to exchange allocated vma for base segment "
147+
"(error %d)", rc);
148+
goto out;
149+
}
150+
151+
if (oshmem_my_proc_id() != 0) {
152+
ptr = memheap_mmap_get(base, size);
153+
}
154+
155+
MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s",
156+
oshmem_my_proc_id(), base,
157+
(base == ptr)? "ok" : "unavailable");
158+
159+
*address = base;
160+
if (need_sync) {
161+
/* They all succeed or fail to allow fallback */
162+
rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr));
163+
if (OSHMEM_SUCCESS != rc) {
164+
MEMHEAP_ERROR("Failed to exchange selected vma for base segment "
165+
"(error %d)", rc);
166+
goto out;
167+
}
168+
169+
for (i = 0; i < nprocs; i++) {
170+
if ((NULL == bases[i]) || (bases[i] != base)) {
171+
*address = NULL;
172+
break;
173+
}
174+
}
175+
} else if (ptr != base) {
176+
/* Any failure terminates the rank and others start teardown */
177+
rc = OSHMEM_ERROR;
178+
}
179+
180+
out:
181+
if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (ptr != NULL)) {
182+
(void)munmap(ptr, size);
183+
}
184+
185+
free(bases);
186+
return rc;
187+
}
188+
189+
190+
/*
191+
* The returned mca_sshmem_base_start_address value is reserved by using
192+
* mmap() for the expected size.
193+
*/
194+
static int memheap_base_segment_setup(size_t size)
195+
{
196+
int rc;
197+
198+
if ((mca_sshmem_base_start_address == (void *)UINTPTR_MAX) ||
199+
(mca_sshmem_base_start_address == NULL)) {
200+
if (UINTPTR_MAX == 0xFFFFFFFF) {
201+
/**
202+
* if 32 bit we set sshmem_base_start_adress to 0
203+
* to let OS allocate segment automatically
204+
*/
205+
mca_sshmem_base_start_address = NULL;
206+
return OSHMEM_SUCCESS;
207+
}
208+
209+
rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address);
210+
if (OSHMEM_SUCCESS != rc) {
211+
MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc);
212+
return rc;
213+
}
214+
215+
if (NULL != mca_sshmem_base_start_address) {
216+
goto done; /* Region is reserved */
217+
}
218+
219+
#if defined(__aarch64__)
220+
mca_sshmem_base_start_address = (void*)0xAB0000000000;
221+
#else
222+
mca_sshmem_base_start_address = (void*)0xFF000000;
223+
#endif
224+
}
225+
226+
if (mca_sshmem_base_start_address != memheap_mmap_get(
227+
mca_sshmem_base_start_address, size)) {
228+
MEMHEAP_ERROR("Failed to create segment address %p/%zu",
229+
mca_sshmem_base_start_address, size);
230+
return OSHMEM_ERROR;
231+
}
232+
233+
done:
234+
if (oshmem_my_proc_id() == 0) {
235+
MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu",
236+
mca_sshmem_base_start_address, size);
237+
}
238+
239+
return OSHMEM_SUCCESS;
240+
}
241+
110242
static memheap_context_t* _memheap_create(void)
111243
{
112244
int rc = OSHMEM_SUCCESS;
@@ -124,13 +256,18 @@ static memheap_context_t* _memheap_create(void)
124256

125257
OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()");
126258

127-
/* Inititialize symmetric area */
128-
if (OSHMEM_SUCCESS == rc) {
129-
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
130-
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
131-
"regular_mem");
259+
/* Locate and reserve symmetric area */
260+
rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE);
261+
if (OSHMEM_SUCCESS != rc) {
262+
MEMHEAP_ERROR("Failed to negotiate base segment addres");
263+
return NULL;
132264
}
133265

266+
/* Initialize symmetric area */
267+
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
268+
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
269+
"regular_mem");
270+
134271
OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()");
135272

136273
/* Initialize atomic symmetric area */

oshmem/mca/memheap/base/memheap_base_static.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,17 @@
2323
#include <pthread.h>
2424

2525
static int _check_perms(const char *perm);
26+
static int _check_non_static_segment(const map_segment_t *mem_segs,
27+
int n_segment,
28+
const void *start, const void *end);
2629
static int _check_address(void *start, void **end);
2730
static int _check_pathname(uint64_t inode, const char *pathname);
2831

2932
int mca_memheap_base_static_init(mca_memheap_map_t *map)
3033
{
3134
/* read and parse segments from /proc/self/maps */
3235
int ret = OSHMEM_SUCCESS;
36+
int n_segments = map->n_segments;
3337
uint64_t total_mem = 0;
3438
void* start;
3539
void* end;
@@ -52,14 +56,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
5256
return OSHMEM_ERROR;
5357
}
5458

55-
#ifdef __linux__
56-
extern unsigned _end;
57-
if (mca_sshmem_base_start_address < (uintptr_t)&_end) {
58-
MEMHEAP_VERBOSE(1, "sshmem base start address is inside data region"
59-
" (%p < %p)", mca_sshmem_base_start_address, &_end);
60-
}
61-
#endif
62-
6359
while (NULL != fgets(line, sizeof(line), fp)) {
6460
if (3 > sscanf(line,
6561
"%llx-%llx %s %llx %s %llx %s",
@@ -75,6 +71,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
7571
goto out;
7672
}
7773

74+
if (OSHMEM_ERROR == _check_non_static_segment(
75+
map->mem_segs, n_segments,
76+
start, end)) {
77+
continue;
78+
}
79+
7880
if (OSHMEM_ERROR == _check_address(start, &end))
7981
continue;
8082

@@ -136,6 +138,26 @@ static int _check_perms(const char *perms)
136138
return OSHMEM_ERROR;
137139
}
138140

141+
static int _check_non_static_segment(const map_segment_t *mem_segs,
142+
int n_segment,
143+
const void *start, const void *end)
144+
{
145+
int i;
146+
147+
for (i = 0; i < n_segment; i++) {
148+
if ((start <= mem_segs[i].super.va_base) &&
149+
(mem_segs[i].super.va_base < end)) {
150+
MEMHEAP_VERBOSE(100,
151+
"non static segment: %p-%p already exists as %p-%p",
152+
start, end, mem_segs[i].super.va_base,
153+
mem_segs[i].super.va_end);
154+
return OSHMEM_ERROR;
155+
}
156+
}
157+
158+
return OSHMEM_SUCCESS;
159+
}
160+
139161
static int _check_address(void *start, void **end)
140162
{
141163
/* FIXME Linux specific code */
@@ -146,11 +168,9 @@ static int _check_address(void *start, void **end)
146168
/**
147169
* SGI shmem only supports globals&static in main program.
148170
* It does not support them in shared objects or in dlopen()
149-
* (Clarified on PGAS 2011 tutorial)
171+
* (Clarified on PGAS 2011 tutorial).
150172
*
151-
* So ignored any maps that start higher then process _end
152-
* FIXME: make sure we do not register symmetric heap twice
153-
* if we decide to allow shared objects
173+
* So ignored any maps that start higher then process _end.
154174
*/
155175
if ((uintptr_t)start > data_end) {
156176
MEMHEAP_VERBOSE(100,

oshmem/mca/sshmem/base/sshmem_base_open.c

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,7 @@
3131
* globals
3232
*/
3333

34-
/**
35-
* if 32 bit we set sshmem_base_start_adress to 0
36-
* to let OS allocate segment automatically
37-
*/
38-
#if UINTPTR_MAX == 0xFFFFFFFF
39-
void *mca_sshmem_base_start_address = (void*)0;
40-
#elif defined(__aarch64__)
41-
void* mca_sshmem_base_start_address = (void*)0xAB0000000000;
42-
#else
43-
void* mca_sshmem_base_start_address = (void*)0xFF000000;
44-
#endif
34+
void *mca_sshmem_base_start_address = UINTPTR_MAX;
4535

4636
char * mca_sshmem_base_backing_file_dir = NULL;
4737

oshmem/mca/sshmem/sysv/sshmem_sysv_module.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ segment_create(map_segment_t *ds_buf,
169169
}
170170

171171
/* Attach to the segment */
172+
(void)munmap(mca_sshmem_base_start_address, size);
172173
addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0);
173174
if (addr == (void *) -1L) {
174175
opal_show_help("help-oshmem-sshmem.txt",

oshmem/runtime/oshmem_shmem_exchange.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,15 @@
1616
#include "oshmem/runtime/runtime.h"
1717
#include "oshmem/runtime/params.h"
1818

19+
int oshmem_shmem_bcast(void *buf, int elem_size, int root)
20+
{
21+
int rc;
22+
23+
rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world);
24+
25+
return rc;
26+
}
27+
1928
int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size)
2029
{
2130
int rc;

oshmem/runtime/runtime.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ int oshmem_shmem_finalize(void);
127127
*/
128128
OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);
129129

130+
/**
131+
* Broadcast between all PEs
132+
*/
133+
OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root);
134+
130135
/**
131136
* Allgather between all PEs
132137
*/

0 commit comments

Comments
 (0)