Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 98 additions & 24 deletions opal/mca/memory/patcher/memory_patcher_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* reserved.
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* Copyright (c) 2016-2019 IBM Corporation. All rights reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -48,6 +48,9 @@
#if defined(HAVE_LINUX_MMAN_H)
#include <linux/mman.h>
#endif
#if defined(HAVE_SYS_IPC_H)
#include <sys/ipc.h>
#endif

#include "memory_patcher.h"
#undef opal_memory_changed
Expand Down Expand Up @@ -104,15 +107,7 @@ opal_memory_patcher_component_t mca_memory_patcher_component = {
* data. If this can be resolved the two levels can be joined.
*/

/*
* The following block of code is #if 0'ed out because we do not need
* to intercept mmap() any more (mmap() only deals with memory
* protection; it does not invalidate any rcache entries for a given
* region). But if we do someday, this is the code that we'll need.
* It's a little non-trivial, so we might as well keep it (and #if 0
* it out).
*/
#if 0
#if defined (SYS_mmap)

#if defined(HAVE___MMAP) && !HAVE_DECL___MMAP
/* prototype for Apple's internal mmap function */
Expand All @@ -121,12 +116,11 @@ void *__mmap (void *start, size_t length, int prot, int flags, int fd, off_t off

static void *(*original_mmap)(void *, size_t, int, int, int, off_t);

static void *intercept_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)
static void *_intercept_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)
{
OPAL_PATCHER_BEGIN;
void *result = 0;

if (prot == PROT_NONE) {
if ((flags & MAP_FIXED) && (start != NULL)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW before this change, mmap(PROT_NONE) was calling release hook, and after this change - it doesn't. Is this the desired behavior?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be okay because previously that entire section was #if 0 so mmap() wasn't being intercepted at all. The PROT_NONE was old code and was #if 0 after a decision a long time ago that mmap PROT_NONE didn't need to be trigger a cache deregistration. So when I re-added mmap() for this commit I just used the UCX code to decide what should be intercepted.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, i guess if flags do not have MAP+FIXED, it will not overwrite an existing mapping anyway

opal_mem_hooks_release_hook (start, length, true);
}

Expand All @@ -137,19 +131,20 @@ static void *intercept_mmap(void *start, size_t length, int prot, int flags, int
#else
result = (void*)(intptr_t) memory_patcher_syscall(SYS_mmap, start, length, prot, flags, fd, offset);
#endif

// I thought we had some issue in the past with the above line for IA32,
// like maybe syscall() wouldn't handle that many arguments. But just now
// I used gcc -m32 and it worked on a recent system. But there's a possibility
// that older ia32 systems may need some other code to make the above syscall.
} else {
result = original_mmap (start, length, prot, flags, fd, offset);
}

OPAL_PATCHER_END;
return result;
}

static void *intercept_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)
{
OPAL_PATCHER_BEGIN;
void *result = _intercept_mmap (start, length, prot, flags, fd, offset);
OPAL_PATCHER_END;
return result;
}
#endif

#if defined (SYS_munmap)
Expand Down Expand Up @@ -256,6 +251,9 @@ static int _intercept_madvise (void *start, size_t length, int advice)
int result = 0;

if (advice == MADV_DONTNEED ||
#ifdef MADV_FREE
advice == MADV_FREE ||
#endif
#ifdef MADV_REMOVE
advice == MADV_REMOVE ||
#endif
Expand Down Expand Up @@ -341,7 +339,12 @@ static int intercept_brk (void *addr)

#endif

#if defined(SYS_shmdt) && defined(__linux__)
#define HAS_SHMDT (defined(SYS_shmdt) || \
(defined(IPCOP_shmdt) && defined(SYS_ipc)))
#define HAS_SHMAT (defined(SYS_shmat) || \
(defined(IPCOP_shmat) && defined(SYS_ipc)))

#if (HAS_SHMDT || HAS_SHMAT) && defined(__linux__)

#include <stdio.h>
#include <fcntl.h>
Expand Down Expand Up @@ -404,6 +407,68 @@ static size_t memory_patcher_get_shm_seg_size (const void *shmaddr)
return seg_size;
}

static size_t get_shm_size(int shmid)
{
struct shmid_ds ds;
int ret;

ret = shmctl(shmid, IPC_STAT, &ds);
if (ret < 0) {
return 0;
}

return ds.shm_segsz;
}
#endif

#if HAS_SHMAT && defined(__linux__)
static void *(*original_shmat)(int shmid, const void *shmaddr, int shmflg);

static void *_intercept_shmat(int shmid, const void *shmaddr, int shmflg)
{
void *result = 0;

size_t size = get_shm_size(shmid);

if ((shmflg & SHM_REMAP) && (shmaddr != NULL)) {
// I don't really know what REMAP combined with SHM_RND does, so I'll just
// guess it remaps all the way down to the lower attach_addr, and all the
// way up to the original shmaddr+size
uintptr_t attach_addr = (uintptr_t)shmaddr;

if (shmflg & SHM_RND) {
attach_addr -= ((uintptr_t)shmaddr) % SHMLBA;
size += ((uintptr_t)shmaddr) % SHMLBA;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO it should be something like this (i will probably have to fix UCX as well):

            attach_addr -= ((uintptr_t)shmaddr) % SHMLBA;
            size        += ((uintptr_t)shmaddr) % SHMLBA;
            size        += (SHMLBA - (size % SHMLBA)) % SHMLBA;  /* Round size up */

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, thinking again, we can't really change the size. we got the size from shmctl() so it's the actual segment size and should be aligned to SHMLBA So we probably should not change 'size' at all, just 'attach_addr' - since the segment attach point just moves by %SHMLBA, but the segment size cannot really change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm leaning toward agreeing. I agree the new mapping will have the original 'size' as its size, and it'll only occupy the range (attach_addr ..+original_size).

And when I read the manpage for REMAP I'm sure they intended to write that it replaces any mapping that existed in the address range where the new mapping goes. But they wrote more explicitly that it replaces any mapping in (addr ..+size) eg where the new mapping would go w/o RND in play.

So being logical I'd say with RND any mapping that's in (attach_addr ..+original_size) gets replaced.

I'm still maybe 5% concerned that a pedantic reading of the manpage would say that they specified the exact range that gets replaced, and the above skips a little bit on the right hand side of that range.

Is there any downside other than an unnecessary cache invalidation to my current code that uses the lower of the low ends and the higher of the high ends of the two interpretations of the manpage? My code would be telling you a certain virtual address had been invalidated when it really hadn't.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, let's be pedantic

}
opal_mem_hooks_release_hook ((void*)attach_addr, size, false);
}

if (!original_shmat) {
#if defined(SYS_shmat)
result = memory_patcher_syscall(SYS_shmat, shmid, shmaddr, shmflg);
#else // IPCOP_shmat
unsigned long ret;
ret = memory_patcher_syscall(SYS_ipc, IPCOP_shmat,
shmid, shmflg, &shmaddr, shmaddr);
result = (ret > -(unsigned long)SHMLBA) ? (void *)ret : (void *)shmaddr;
#endif
} else {
result = original_shmat (shmid, shmaddr, shmflg);
}

return result;
}

static void* intercept_shmat (int shmid, const void * shmaddr, int shmflg)
{
OPAL_PATCHER_BEGIN;
void *result = _intercept_shmat (shmid, shmaddr, shmflg);
OPAL_PATCHER_END;
return result;
}
#endif

#if HAS_SHMDT && defined(__linux__)
static int (*original_shmdt) (const void *);

static int _intercept_shmdt (const void *shmaddr)
Expand All @@ -417,7 +482,11 @@ static int _intercept_shmdt (const void *shmaddr)
if (original_shmdt) {
result = original_shmdt (shmaddr);
} else {
#if defined(SYS_shmdt)
result = memory_patcher_syscall (SYS_shmdt, shmaddr);
#else // IPCOP_shmdt
result = memory_patcher_syscall(SYS_ipc, IPCOP_shmdt, 0, 0, 0, shmaddr);
#endif
}

return result;
Expand Down Expand Up @@ -478,9 +547,7 @@ static int patcher_open (void)
/* set memory hooks support level */
opal_mem_hooks_set_support (OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT);

#if 0
/* See above block to see why mmap() functionality is #if 0'ed
out */
#if defined (SYS_mmap)
rc = opal_patcher->patch_symbol ("mmap", (uintptr_t) intercept_mmap, (uintptr_t *) &original_mmap);
if (OPAL_SUCCESS != rc) {
return rc;
Expand Down Expand Up @@ -508,7 +575,14 @@ static int patcher_open (void)
}
#endif

#if defined(SYS_shmdt) && defined(__linux__)
#if HAS_SHMAT && defined(__linux__)
rc = opal_patcher->patch_symbol ("shmat", (uintptr_t) intercept_shmat, (uintptr_t *) &original_shmat);
if (OPAL_SUCCESS != rc) {
return rc;
}
#endif

#if HAS_SHMDT && defined(__linux__)
rc = opal_patcher->patch_symbol ("shmdt", (uintptr_t) intercept_shmdt, (uintptr_t *) &original_shmdt);
if (OPAL_SUCCESS != rc) {
return rc;
Expand Down