Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HPC-GAP: Allow configuration of TLS and improve native TLS performance on macOS #3502

Merged
merged 1 commit into from
Nov 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ AX_GCC_FUNC_ATTRIBUTE([always_inline])
AX_GCC_FUNC_ATTRIBUTE([format])
AX_GCC_FUNC_ATTRIBUTE([noinline])
AX_GCC_FUNC_ATTRIBUTE([noreturn])
AX_GCC_FUNC_ATTRIBUTE([constructor])
AX_GCC_FUNC_ATTRIBUTE([pure])

dnl compiler builtins
AC_DEFUN([CHECK_COMPILER_BUILTIN],
Expand Down Expand Up @@ -202,6 +204,135 @@ AS_CASE([$with_gc],
)
AC_MSG_RESULT([$with_gc])

dnl
dnl User setting: native thread-local storage (off by default)
fingolfin marked this conversation as resolved.
Show resolved Hide resolved
dnl See src/hpc/tls.h for more details on thread-local storage options.
dnl

AC_ARG_ENABLE([native-tls],
    [AS_HELP_STRING([--enable-native-tls],
        [use native thread-local storage implementation])],
    [enable_native_tls=$enableval],
    [enable_native_tls=no])
AC_MSG_CHECKING([whether to use native tls])
dnl
dnl Distinguish between cases where we support __thread declarations
dnl and situations where we use a pthread_getspecific() implementation.
dnl Right now, we only do the latter for 64-bit macOS. See src/hpc/tls.h
dnl for details.
dnl
enable_macos_tls_asm=default
dnl
dnl Use "=" for the string comparison: POSIX test only specifies "=",
dnl and "==" is rejected by shells such as dash, which would make the
dnl check below fail even when --enable-native-tls was given.
dnl
AS_IF([[test "x$enable_native_tls" = "xyes"]], [
    case "$host" in
    x86_64-apple-darwin*)
        AC_DEFINE([USE_PTHREAD_TLS], [1], [define as 1 if using pthread_getspecific])
        dnl
        dnl Test if we can optimize pthread_getspecific() calls via
        dnl inline assembly on macOS.
        dnl
        AC_RUN_IFELSE(
            [AC_LANG_SOURCE([[
// The following code also occurs in src/hpc/thread.c and both need to be
// kept in sync.
#include <pthread.h>
#include <string.h>

#define OFFS 0x100
#define END (-1)

int cmpOpCode(unsigned char *code, int *with) {
    int result = 0;
    while (*with >= 0) {
        if (*with == OFFS) {
            result = *code;
        } else {
            if (*code != *with)
                return -1;
        }
        code++;
        with++;
    }
    return result;
}

int main() {
    // This is an idea borrowed from Mono. We test if the implementation
    // of pthread_getspecific() uses the assembly code below. If that is
    // true, we can replace calls to pthread_getspecific() with the
    // matching inline assembly, allowing a significant performance boost.
#if defined(__APPLE__) && defined(__x86_64__)
    // There are two possible implementations.
    static int asm_code[] = {
        // movq %gs:[OFFS](,%rdi,8), %rax
        // retq
        0x65, 0x48, 0x8b, 0x04, 0xfd, OFFS, 0x00, 0x00, 0x00, 0xc3, END
    };
    static int asm_code2[] = {
        // pushq %rbp
        // movq %rsp, %rbp
        // movq %gs:[OFFS](,%rdi,8),%rax
        // popq %rbp
        // retq
        0x55, 0x48, 0x89, 0xe5, 0x65, 0x48, 0x8b, 0x04, 0xfd, OFFS,
        0x00, 0x00, 0x00, 0x5d, 0xc3, END
    };
    if (cmpOpCode((unsigned char *)pthread_getspecific, asm_code) >= 0) {
        return 0;
    }
    if (cmpOpCode((unsigned char *)pthread_getspecific, asm_code2) >= 0) {
        return 0;
    }
    return 1;
#else
#error FAIL
#endif
}
            ]])],
            dnl => Test succeeded
            [AC_DEFINE([USE_MACOS_PTHREAD_TLS_ASM], [1],
                [define as 1 if macOS assembly implementation supported])
             AC_MSG_RESULT([[yes (macOS assembly)]])
             enable_macos_tls_asm=yes
            ],
            dnl => Test failed
            [AC_MSG_RESULT([[yes (pthread-based)]])
             enable_macos_tls_asm=no
            ],
            dnl => Cross-compilation, test impossible
            [AC_MSG_RESULT([[yes (pthread-based)]])
             enable_macos_tls_asm=no
            ])
        ;;
    *)
        dnl => Any other OS, --enable-native-tls specified
        AC_DEFINE([USE_NATIVE_TLS], [1], [define as 1 if using native TLS])
        AC_MSG_RESULT([yes])
        ;;
    esac
    ],
    [
    dnl => --disable-native-tls (the default)
    AC_MSG_RESULT([no])
    ]
)
dnl
dnl Test if pthread_getspecific() can be overridden as
dnl __attribute__((pure)).
dnl
dnl Compile-only probe: redeclare pthread_getspecific() with the pure
dnl attribute after including <pthread.h>. If the compiler accepts the
dnl redeclaration, ALLOW_PURE_PTHREAD_GETSPECIFIC is defined as 1.
AC_MSG_CHECKING([[whether pthread_getspecific() can be made pure]])
AC_COMPILE_IFELSE(
    [AC_LANG_SOURCE([[
#include <pthread.h>
__attribute__((pure))
void * pthread_getspecific(pthread_key_t key);
]])],
    [
        AC_DEFINE([ALLOW_PURE_PTHREAD_GETSPECIFIC], [1],
            [define as 1 if pthread_getspecific() can be declared pure])
        AC_MSG_RESULT([yes])
    ],
    [
        AC_MSG_RESULT([no])
    ])

dnl
dnl User setting: Debug mode (off by default)
dnl
Expand Down
1 change: 0 additions & 1 deletion src/hpc/guards.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ static ALWAYS_INLINE Bag ImpliedReadGuard(Bag bag)
return bag;
}


static ALWAYS_INLINE int CheckReadAccess(Bag bag)
{
Region *region;
Expand Down
119 changes: 106 additions & 13 deletions src/hpc/thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,95 @@ void UnlockThreadControl(void)
#define MAP_ANONYMOUS MAP_ANON
#endif

#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS

#ifdef USE_PTHREAD_TLS
static int InitTLSKey;
static pthread_key_t TLSKey;
#ifdef USE_MACOS_PTHREAD_TLS_ASM
static UInt TLSOffset;

// The following code also occurs in configure.ac, and both need to be
// kept in sync.

/* Pattern marker: the code byte at this position is a wildcard whose
 * value is captured and returned by cmpOpCode(). It is larger than any
 * byte value, so it cannot collide with a literal opcode byte. */
#define OFFS 0x100
/* Pattern terminator; any negative pattern entry ends the comparison. */
#define END (-1)

/*
 * Compare the machine code at <code> against the pattern <with>.
 *
 * <with> is a sequence of ints ended by a negative entry (END). Each
 * entry is either a literal byte that must match the corresponding byte
 * of <code>, or OFFS, which matches any byte and records its value.
 *
 * Returns -1 on the first mismatching byte. Otherwise returns the byte
 * captured at the last OFFS position, or 0 if the pattern has no OFFS.
 *
 * A copy of this matcher lives in configure.ac; keep the two in sync.
 */
int cmpOpCode(unsigned char *code, int *with) {
    int captured = 0;
    for (; *with >= 0; code++, with++) {
        if (*with == OFFS) {
            /* wildcard position: remember the byte instead of comparing */
            captured = *code;
            continue;
        }
        if (*code != *with)
            return -1;
    }
    return captured;
}

void FindTLSOffset() {
// This is an idea borrowed from Mono. We test if the implementation
// of pthread_getspecific() uses the assembly code below. If that is
// true, we can replace calls to pthread_getspecific() with the
// matching inline assembly, allowing a significant performance boost.
// There are two possible implementations.
fingolfin marked this conversation as resolved.
Show resolved Hide resolved
static int asm_code[] = {
// movq %gs:[OFFS](,%rdi,8), %rax
// retq
0x65, 0x48, 0x8b, 0x04, 0xfd, OFFS, 0x00, 0x00, 0x00, 0xc3, END
};
static int asm_code2[] = {
// pushq %rbp
// movq %rsp, %rbp
// movq %gs:[OFFS](,%rdi,8),%rax
// popq %rbp
// retq
0x55, 0x48, 0x89, 0xe5, 0x65, 0x48, 0x8b, 0x04, 0xfd, OFFS,
0x00, 0x00, 0x00, 0x5d, 0xc3, END
};
TLSOffset = cmpOpCode((unsigned char *)pthread_getspecific, asm_code);
if (TLSOffset >= 0)
return;
TLSOffset = cmpOpCode((unsigned char *)pthread_getspecific, asm_code2);
if (TLSOffset >= 0)
return;
Panic("Unable to find macOS thread-local storage offset");
}
#endif

/*
 * Create the pthread key under which the thread-local state is stored
 * and mark it as initialized via InitTLSKey. When the macOS assembly
 * fast path is enabled, also determine the TLS offset.
 *
 * Calls Panic() if the key cannot be created, since TLSKey would then
 * be unusable for every later pthread_getspecific()/pthread_setspecific()
 * call.
 */
static void CreateTLSKey(void)
{
    // pthread_key_create() returns a non-zero error number on failure
    if (pthread_key_create(&TLSKey, NULL) != 0)
        Panic("Unable to create thread-local storage key");
#ifdef USE_MACOS_PTHREAD_TLS_ASM
    FindTLSOffset();
#endif
    InitTLSKey = 1;
}

#ifdef USE_MACOS_PTHREAD_TLS_ASM
UInt GetTLSOffset(void)
{
if (!InitTLSKey) {
CreateTLSKey();
}
return (UInt)TLSKey * sizeof(void *) + TLSOffset;
}
fingolfin marked this conversation as resolved.
Show resolved Hide resolved
#endif
/*
 * Return the pthread key used for thread-local storage, creating it
 * first if InitTLSKey shows it has not been set up yet.
 */
pthread_key_t GetTLSKey(void)
{
    if (InitTLSKey == 0)
        CreateTLSKey();
    return TLSKey;
}
#endif /* USE_PTHREAD_TLS */

void * AllocateTLS(void)
{
#ifndef USE_PTHREAD_TLS
void * addr;
void * result;
size_t pagesize = getpagesize();
Expand All @@ -126,6 +211,14 @@ void * AllocateTLS(void)
mprotect((char *)result + tlssize, pagesize, PROT_NONE);
#endif
return result;
#else
void * result = pthread_getspecific(GetTLSKey());
if (!result) {
result = malloc(sizeof(GAPState));
pthread_setspecific(GetTLSKey(), result);
}
return result;
#endif /* USE_PTHREAD_TLS */
}

void FreeTLS(void * address)
Expand All @@ -137,7 +230,7 @@ void FreeTLS(void * address)
#endif
}

#endif /* HAVE_NATIVE_TLS */
#endif /* USE_NATIVE_TLS */

#ifndef DISABLE_GC
void AddGCRoots(void)
Expand All @@ -157,7 +250,7 @@ static void RemoveGCRoots(void)
}
#endif /* DISABLE_GC */

#ifndef HAVE_NATIVE_TLS
#if !defined(USE_NATIVE_TLS) && !defined(USE_PTHREAD_TLS)

/* In order to safely use thread-local memory on the main stack, we have
* to work around an idiosyncrasy in some virtual memory systems. These
Expand Down Expand Up @@ -191,7 +284,7 @@ static NOINLINE void GrowStack(void)

static NOINLINE void SetupTLS(void)
{
#ifndef HAVE_NATIVE_TLS
#if !defined(USE_NATIVE_TLS) && !defined(USE_PTHREAD_TLS)
GrowStack();
#endif
InitializeTLS();
Expand All @@ -212,7 +305,7 @@ void RunThreadedMain(int (*mainFunction)(int, char **),
int argc,
char ** argv)
{
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
#ifdef STACK_GROWS_UP
#error Upward growing stack not yet supported
#else
Expand Down Expand Up @@ -355,9 +448,8 @@ static void * DispatchThread(void * arg)
Obj RunThread(void (*start)(void *), void * arg)
{
ThreadData * result;
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
void * tls;
size_t pagesize = getpagesize();
#endif
pthread_attr_t thread_attr;
LockThreadControl(1);
Expand All @@ -370,7 +462,7 @@ Obj RunThread(void (*start)(void *), void * arg)
}
result = thread_free_list;
thread_free_list = thread_free_list->next;
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
if (!result->tls)
result->tls = AllocateTLS();
tls = result->tls;
Expand All @@ -395,7 +487,8 @@ Obj RunThread(void (*start)(void *), void * arg)
result->thread_object = NewThreadObject(result - thread_data);
/* set up the thread attribute to support a custom stack in our TLS */
pthread_attr_init(&thread_attr);
#ifndef HAVE_NATIVE_TLS
#if !defined(USE_NATIVE_TLS) && !defined(USE_PTHREAD_TLS)
size_t pagesize = getpagesize();
pthread_attr_setstack(&thread_attr, (char *)tls + pagesize * 2,
TLS_SIZE - pagesize * 2);
#endif
Expand All @@ -411,7 +504,7 @@ Obj RunThread(void (*start)(void *), void * arg)
thread_free_list = result;
UnlockThreadControl();
pthread_attr_destroy(&thread_attr);
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
FreeTLS(tls);
#endif
return (Obj)0;
Expand All @@ -424,15 +517,15 @@ int JoinThread(int id)
{
pthread_t pthread_id;
void (*start)(void *);
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
void * tls;
#endif
if (id < 0 || id >= MAX_THREADS)
return 0;
LockThreadControl(1);
pthread_id = thread_data[id].pthread_id;
start = thread_data[id].start;
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
tls = thread_data[id].tls;
#endif
if (thread_data[id].joined || start == NULL) {
Expand All @@ -451,7 +544,7 @@ int JoinThread(int id)
*/
thread_data[id].start = NULL;
UnlockThreadControl();
#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
FreeTLS(tls);
#endif
return 1;
Expand Down
2 changes: 1 addition & 1 deletion src/hpc/thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

extern int PreThreadCreation;

#ifndef HAVE_NATIVE_TLS
#ifndef USE_NATIVE_TLS
void *AllocateTLS(void);
void FreeTLS(void *address);
#endif
Expand Down
Loading