Skip to content

Commit 6911fa0

Browse files
committed
Address review
1 parent 9e94aa4 commit 6911fa0

File tree

7 files changed

+82
-81
lines changed

7 files changed

+82
-81
lines changed

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ Command-line option changes
2121
Multi-threading changes
2222
-----------------------
2323

24+
- New functions `Threads.atomic_fence_heavy` and `Threads.atomic_fence_light` provide support for
25+
asymmetric atomic fences, speeding up atomic synchronization where one side of the synchronization
26+
runs significantly less often than the other ([#60311]).
27+
2428
Build system changes
2529
--------------------
2630

base/atomics.jl

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export
1010
atomic_add!, atomic_sub!,
1111
atomic_and!, atomic_nand!, atomic_or!, atomic_xor!,
1212
atomic_max!, atomic_min!,
13-
atomic_fence
13+
atomic_fence, atomic_fence_light, atomic_fence_heavy
1414

1515
"""
1616
Threads.Atomic{T}
@@ -334,23 +334,23 @@ atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system)
334334
"""
335335
Threads.atomic_fence_light()
336336
337-
This is a read-optimized sequential-consistency memory fence.
338-
On supported operating systems and architectures, this fence is cheaper
339-
than `Threads.atomic_fence()`, but synchronizes only with
340-
[`atomic_fence_heavy`](@ref) calls from other threads.
337+
Insert the light side of an asymmetric sequential-consistency memory fence.
338+
Asymmetric memory fences are useful in scenarios where one side of the
339+
synchronization runs significantly less often than the other side. Use this
340+
function on the side that runs often and [`atomic_fence_heavy`](@ref) on the
341+
side that runs rarely.
342+
343+
On supported operating systems and architectures this fence is cheaper than
344+
`Threads.atomic_fence()`, but synchronizes only with [`atomic_fence_heavy`](@ref)
345+
calls from other threads.
341346
"""
342347
atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread)
343348

344349
"""
345350
Threads.atomic_fence_heavy()
346351
347-
This is a write-optimized sequential-consistency memory fence.
348-
This fence is significantly more expensive than `Threads.atomic_fence`.
349-
It generally requires a system call and a full interprocessor interrupt
350-
to all other processors in the system. It synchronizes with both
351-
[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads.
352-
353-
For further details, see the Linux `membarrier` syscall or the Windows
354-
`FlushProcessWriteBuffers` API.
352+
Insert the heavy side of an asymmetric sequential-consistency memory fence.
353+
Use this function on the side that runs rarely.
354+
See [`atomic_fence_light`](@ref) for more details.
355355
"""
356356
atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ())

doc/src/base/multi-threading.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ Base.Threads.atomic_xor!
5050
Base.Threads.atomic_max!
5151
Base.Threads.atomic_min!
5252
Base.Threads.atomic_fence
53+
Base.Threads.atomic_fence_heavy
54+
Base.Threads.atomic_fence_light
5355
```
5456

5557
## ccall using a libuv threadpool (Experimental)

src/runtime_intrinsics.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -626,13 +626,13 @@ JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *sync
626626
{
627627
JL_TYPECHK(fence, symbol, order_sym);
628628
JL_TYPECHK(fence, symbol, syncscope_sym);
629+
enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1);
629630
if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) {
630631
asm volatile ("" : : : "memory");
631632
return jl_nothing;
632633
} else if ((jl_sym_t*)syncscope_sym != jl_system_sym) {
633634
jl_error("atomic_fence: invalid syncscope");
634635
}
635-
enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1);
636636
if (order > jl_memory_order_monotonic)
637637
jl_fence();
638638
return jl_nothing;

src/signals-mach.c

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -898,34 +898,32 @@ JL_DLLEXPORT void jl_profile_stop_timer(void)
898898
// Copyright (c) .NET Foundation and Contributors
899899
// MIT LICENSE
900900
JL_DLLEXPORT void jl_membarrier(void) {
901-
mach_msg_type_number_t cThreads;
902-
thread_act_t *pThreads;
903-
kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads);
904-
HANDLE_MACH_ERROR("task_threads()", machret);
905-
906901
uintptr_t sp;
907902
uintptr_t registerValues[128];
903+
kern_return_t machret;
908904

909905
// Iterate through each of the threads in the list.
910-
for (mach_msg_type_number_t i = 0; i < cThreads; i++)
911-
{
906+
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
907+
for (int tid = 0; tid < nthreads; tid++) {
908+
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
909+
thread_act_t thread = pthread_mach_thread_np(ptls2->system_id);
912910
if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *))
913911
{
914912
// Request the thread's register pointer values to force the thread to emit a memory barrier
915913
size_t registers = 128;
916-
machret = thread_get_register_pointer_values(pThreads[i], &sp, &registers, registerValues);
914+
machret = thread_get_register_pointer_values(thread, &sp, &registers, registerValues);
917915
}
918916
else
919917
{
920918
// fallback implementation for older OS versions
921919
#if defined(_CPU_X86_64_)
922920
x86_thread_state64_t threadState;
923921
mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT;
924-
machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count);
922+
machret = thread_get_state(thread, x86_THREAD_STATE64, (thread_state_t)&threadState, &count);
925923
#elif defined(_CPU_AARCH64_)
926924
arm_thread_state64_t threadState;
927925
mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;
928-
machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count);
926+
machret = thread_get_state(thread, ARM_THREAD_STATE64, (thread_state_t)&threadState, &count);
929927
#else
930928
#error Unexpected architecture
931929
#endif
@@ -935,11 +933,5 @@ JL_DLLEXPORT void jl_membarrier(void) {
935933
{
936934
HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret);
937935
}
938-
939-
machret = mach_port_deallocate(mach_task_self(), pThreads[i]);
940-
HANDLE_MACH_ERROR("mach_port_deallocate()", machret);
941936
}
942-
// Deallocate the thread list now we're done with it.
943-
machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t));
944-
HANDLE_MACH_ERROR("vm_deallocate()", machret);
945937
}

src/signals-unix.c

Lines changed: 44 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,48 +1275,15 @@ JL_DLLEXPORT int jl_repl_raise_sigtstp(void)
12751275
return raise(SIGTSTP);
12761276
}
12771277

1278-
// Linux and FreeBSD have compatible membarrier support
1279-
#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_)
1280-
#if defined(_OS_LINUX_)
1281-
# include <sys/syscall.h>
1282-
# if defined(__has_include)
1283-
# if __has_include(<linux/membarrier.h>)
1284-
# include <linux/membarrier.h>
1285-
# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__)
1286-
# else
1287-
# if defined(__NR_membarrier)
1288-
enum membarrier_cmd {
1289-
MEMBARRIER_CMD_QUERY = 0,
1290-
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
1291-
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
1292-
};
1293-
# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__)
1294-
# else
1295-
# warning "Missing linux kernel headers for membarrier syscall, support disabled"
1296-
# define membarrier(...) -ENOSYS
1297-
# endif
1298-
# endif
1299-
# else
1300-
# include <linux/membarrier.h>
1301-
# endif
1302-
#elif defined(_OS_FREEBSD_)
1303-
# include <sys/param.h>
1304-
# if __FreeBSD_version >= 1401500
1305-
# include <sys/membarrier.h>
1306-
# else
1307-
# define MEMBARRIER_CMD_QUERY 0x00
1308-
# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08
1309-
# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10
1310-
# define membarrier(...) -ENOSYS
1311-
# endif
1312-
#endif
1313-
1278+
#if !defined(_OS_DARWIN_)
13141279
// Implementation of the `mprotect` based membarrier fallback.
13151280
// This is a common fallback based on the observation that `mprotect` happens to
13161281
// issue the necessary memory barriers. However, there is no spec that
1317-
// guarantees this behavior, and indeed AArch64 macos does not. However, we
1318-
// only use it as a fallback here for older versions of Linux and FreeBSD where
1319-
// we know that it happens to work.
1282+
// guarantees this behavior, and indeed AArch64 Darwin does not (so we don't use it
1283+
// there). However, we only use it as a fallback here for older versions of
1284+
// Linux and FreeBSD where we know that it happens to work. We also use it as a
1285+
// fallback for unknown Unix systems under the assumption that it will work,
1286+
// but this is not guaranteed.
13201287
static pthread_mutex_t mprotect_barrier_lock = PTHREAD_MUTEX_INITIALIZER;
13211288
static _Atomic(uint64_t) *mprotect_barrier_page = NULL;
13221289
static void jl_init_mprotect_membarrier(void)
@@ -1335,7 +1302,7 @@ static void jl_init_mprotect_membarrier(void)
13351302
}
13361303
result = mlock(mprotect_barrier_page, pagesize);
13371304
if (result != 0) {
1338-
jl_safe_printf("fatal: failed to mlock barrier page.\n");
1305+
jl_safe_printf("fatal: failed to mlock barrier page (try increasing RLIMIT_MEMLOCK with `ulimit -l`).\n");
13391306
abort();
13401307
}
13411308
}
@@ -1349,15 +1316,43 @@ static void jl_mprotect_membarrier(void)
13491316
int result = pthread_mutex_lock(&mprotect_barrier_lock);
13501317
assert(result == 0);
13511318
size_t pagesize = jl_getpagesize();
1352-
result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE);
1319+
result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE);
13531320
jl_atomic_fetch_add_relaxed(mprotect_barrier_page, 1);
13541321
assert(result == 0);
1355-
result = mprotect(mprotect_barrier_page, pagesize, PROT_READ | PROT_WRITE);
1322+
result = mprotect(mprotect_barrier_page, pagesize, PROT_NONE);
13561323
assert(result == 0);
13571324
result = pthread_mutex_unlock(&mprotect_barrier_lock);
13581325
assert(result == 0);
13591326
(void)result;
13601327
}
1328+
#endif
1329+
1330+
// Linux and FreeBSD have compatible membarrier support
1331+
#if defined(_OS_LINUX_) || defined(_OS_FREEBSD_)
1332+
#if defined(_OS_LINUX_)
1333+
# include <sys/syscall.h>
1334+
# if defined(__NR_membarrier)
1335+
enum membarrier_cmd {
1336+
MEMBARRIER_CMD_QUERY = 0,
1337+
MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
1338+
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
1339+
};
1340+
# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__)
1341+
# else
1342+
# warning "Missing linux kernel headers for membarrier syscall, support disabled"
1343+
# define membarrier(...) (errno = ENOSYS, -1)
1344+
# endif
1345+
#elif defined(_OS_FREEBSD_)
1346+
# include <sys/param.h>
1347+
# if __FreeBSD_version >= 1401500
1348+
# include <sys/membarrier.h>
1349+
# else
1350+
# define MEMBARRIER_CMD_QUERY 0x00
1351+
# define MEMBARRIER_CMD_PRIVATE_EXPEDITED 0x08
1352+
# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 0x10
1353+
# define membarrier(...) (errno = ENOSYS, -1)
1354+
# endif
1355+
#endif
13611356

13621357
// Implementation of `jl_membarrier`
13631358
enum membarrier_implementation {
@@ -1391,11 +1386,17 @@ JL_DLLEXPORT void jl_membarrier(void) {
13911386
}
13921387
if (impl == MEMBARRIER_IMPLEMENTATION_SYS_MEMBARRIER) {
13931388
int ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
1394-
assert(ret);
1389+
assert(ret == 0);
13951390
(void)ret;
13961391
} else {
13971392
assert(impl == MEMBARRIER_IMPLEMENTATION_MPROTECT);
13981393
jl_mprotect_membarrier();
13991394
}
14001395
}
1396+
#elif !defined(_OS_DARWIN_)
1397+
JL_DLLEXPORT void jl_membarrier(void) {
1398+
if (!mprotect_barrier_page)
1399+
jl_init_mprotect_membarrier();
1400+
jl_mprotect_membarrier();
1401+
}
14011402
#endif

test/threads_exec.jl

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -465,8 +465,8 @@ end
465465
test_fence()
466466

467467
# Test asymmetric thread fences
468-
const asymmetric_test_count = 200_000
469468
struct AsymmetricFenceTestData
469+
n::Int
470470
x::AtomicMemory{Int}
471471
y::AtomicMemory{Int}
472472
read_x::AtomicMemory{Int}
@@ -482,23 +482,25 @@ function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, thre
482482
else
483483
@atomic :monotonic data.y[it] = 1
484484
Threads.atomic_fence_light()
485-
@atomic :monotonic data.read_x[it] = data.x[it]
485+
@atomic :monotonic data.read_x[it] = @atomic :monotonic data.x[it]
486486
notify(cond1)
487487
wait(cond2)
488488
end
489489
end
490-
function test_asymmetric_fence(data, cond1, cond2, threadid)
491-
for i = 1:asymmetric_test_count
490+
function test_asymmetric_fence(data::AsymmetricFenceTestData, cond1, cond2, threadid)
491+
for i = 1:data.n
492492
test_asymmetric_fence(data, cond1, cond2, threadid, i)
493493
end
494494
end
495495
function test_asymmetric_fence()
496+
asymmetric_test_count = 200_000
496497
cond1 = Threads.Event(true)
497498
cond2 = Threads.Event(true)
498-
data = AsymmetricFenceTestData(AtomicMemory{Int}(undef, asymmetric_test_count),
499-
AtomicMemory{Int}(undef, asymmetric_test_count),
500-
AtomicMemory{Int}(undef, asymmetric_test_count),
501-
AtomicMemory{Int}(undef, asymmetric_test_count))
499+
data = AsymmetricFenceTestData(asymmetric_test_count,
500+
AtomicMemory{Int}(undef, asymmetric_test_count),
501+
AtomicMemory{Int}(undef, asymmetric_test_count),
502+
AtomicMemory{Int}(undef, asymmetric_test_count),
503+
AtomicMemory{Int}(undef, asymmetric_test_count))
502504
for i = 1:asymmetric_test_count
503505
@atomic :monotonic data.x[i] = 0
504506
@atomic :monotonic data.y[i] = 0

0 commit comments

Comments
 (0)