Tasks: don't advance task RNG on task spawn

StefanKarpinski · StefanKarpinski · commit 66147d093ea5 · 2023-03-27T16:28:11.000-04:00
Previously we had this unfortunate behavior: julia> Random.seed!(123) TaskLocalRNG() julia> randn() -0.6457306721039767 julia> Random.seed!(123) TaskLocalRNG() julia> fetch(@async nothing) julia> randn() 0.4922456865251828 In other words: the mere act of spawning a child task affects the parent task's RNG (by advancing it four times). This PR preserves the desirable parts of the previous situation: when seeded, the parent and child RNG streams are reproducible. Moreover, it fixes the undesirable behavior: julia> Random.seed!(123) TaskLocalRNG() julia> randn() -0.6457306721039767 julia> Random.seed!(123) TaskLocalRNG() julia> fetch(@async nothing) julia> randn() -0.6457306721039767 In other words: the parent RNG is unaffected by spawning a child. The design is documented in detail in a comment preceding the jl_rng_split function.
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -27,6 +27,7 @@ let
     task.rngState1 = 0x7431eaead385992c
     task.rngState2 = 0x503e1d32781c2608
     task.rngState3 = 0x3a77f7189200c20b
+    task.rngState4 = 0x5502376d099035ae
 
     # Stdlibs sorted in dependency, then alphabetical, order by contrib/print_sorted_stdlibs.jl
     # Run with the `--exclude-jlls` option to filter out all JLL packages
diff --git a/src/gc.c b/src/gc.c
@@ -382,9 +382,9 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO
     ct->sticky = sticky;
 }
 
-static uint64_t finalizer_rngState[4];
+static uint64_t finalizer_rngState[JL_RNG_SIZE];
 
-void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT;
+void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT;
 
 JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void)
 {
@@ -413,7 +413,7 @@ static void run_finalizers(jl_task_t *ct)
     jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0);
     arraylist_new(&to_finalize, 0);
 
-    uint64_t save_rngState[4];
+    uint64_t save_rngState[JL_RNG_SIZE];
     memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState));
     jl_rng_split(ct->rngState, finalizer_rngState);
 
diff --git a/src/jltypes.c b/src/jltypes.c
@@ -2768,7 +2768,7 @@ void jl_init_types(void) JL_GC_DISABLED
                         NULL,
                         jl_any_type,
                         jl_emptysvec,
-                        jl_perm_symsvec(15,
+                        jl_perm_symsvec(16,
                                         "next",
                                         "queue",
                                         "storage",
@@ -2780,11 +2780,12 @@ void jl_init_types(void) JL_GC_DISABLED
                                         "rngState1",
                                         "rngState2",
                                         "rngState3",
+                                        "rngState4",
                                         "_state",
                                         "sticky",
                                         "_isexception",
                                         "priority"),
-                        jl_svec(15,
+                        jl_svec(16,
                                 jl_any_type,
                                 jl_any_type,
                                 jl_any_type,
@@ -2796,6 +2797,7 @@ void jl_init_types(void) JL_GC_DISABLED
                                 jl_uint64_type,
                                 jl_uint64_type,
                                 jl_uint64_type,
+                                jl_uint64_type,
                                 jl_uint8_type,
                                 jl_bool_type,
                                 jl_bool_type,
diff --git a/src/julia.h b/src/julia.h
@@ -1910,6 +1910,8 @@ typedef struct _jl_handler_t {
     size_t world_age;
 } jl_handler_t;
 
+#define JL_RNG_SIZE 5 // xoshiro 4 + splitmix 1
+
 typedef struct _jl_task_t {
     JL_DATA_TYPE
     jl_value_t *next; // invasive linked list for scheduler
@@ -1921,7 +1923,7 @@ typedef struct _jl_task_t {
     jl_function_t *start;
     // 4 byte padding on 32-bit systems
     // uint32_t padding0;
-    uint64_t rngState[4];
+    uint64_t rngState[JL_RNG_SIZE];
     _Atomic(uint8_t) _state;
     uint8_t sticky; // record whether this Task can be migrated to a new thread
     _Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with
diff --git a/src/task.c b/src/task.c
@@ -866,28 +866,160 @@ uint64_t jl_genrandom(uint64_t rngState[4]) JL_NOTSAFEPOINT
     return res;
 }
 
-void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT
+/*
+The jl_rng_split function forks a task's RNG state in a way that is essentially
+guaranteed to avoid collisions between the RNG streams of all forked tasks. The
+main RNG is the xoshiro256++ RNG whose state is stored in rngState[0..3]. There
+is a small internal RNG used for task forking stored in rngState[4]. This state
+is an LCG (linear congruential generator), which is put through four different
+variations of the strongest PCG output function, referred to as PCG-RXS-M-XS-64.
+This output function is invertible: it maps a 64-bit state to 64-bit output, so
+it's not recommended for general purpose RNG usage. In our usage, however, the
+invertibility is actually a benefit, and we only use the RNG output internally.
+
+The goal of this function is to perturb the state of each child task's RNG in
+such a way that, for an entire tree of tasks spawned starting with a given seed
+in a root task, no two tasks have the same RNG state. Moreover, we want to do
+this in a way that is deterministic and repeatable based on the root task's seed
+and the task tree structure. The RNG state of a parent task is allowed to alter
+the RNG state of a child task. The mere fact that a child was spawned should not
+alter the RNG output of the parent, but, of course, children spawned after that
+should have distinct RNG states from previously spawned children.
+
+The basic approach is that used by the DotMix [1] and SplitMix [2] systems: each
+task is uniquely identified by a sequence of "pedigree" numbers, indicating
+where in the task tree it was spawned. This vector of pedigree coordinates is
+then reduced to a single value by computing a dot product with a common vector
+of random weights. The DotMix paper provides a proof that this dot product hash
+value (referred to as a "compression function") is collision resistant in the
+sense that the pairwise collision probability of two distinct tasks is 1/N where
+N is the number of possible weight values. Both DotMix and SplitMix use a prime
+value of N because the proof requires that the difference between two distinct
+pedigree coordinates must be invertible, which is guaranteed by N being prime.
+We take a different approach, however---we limit pedigree coordinates to being
+binary instead: when a task spawns a child, both tasks share the same pedigree
+prefix, with the parent appending a zero and the child appending a one. This
+way a binary vector uniquely identifies each task. Since the coordinates are
+binary, the difference between coordinates in the proof can be taken to always
+be one, which must be invertible, regardless of whether N is prime or not. This
+allows us to compute the dot product using native machine arithmetic, modulo
+2^64 instead of arithmetic in a prime modulus. It also means that when updating
+the dot product incrementally, as described in SplitMix, we don't need to
+multiply weights by anything, since the coordinate is always zero in the parent
+(no change) and one in the child, which simply entails adding the weight.
+
+We use the internal LCG maintained in rngState[4] to generate random weights:
+each time a child is forked, we update the LCG in both parent and child tasks.
+In the parent, that's all we do; the main RNG state is unchanged, but the next
+time the parent forks a child, the Dot/SplitMix weight used will be different,
+corresponding to being a level deeper in the binary task tree. In the child, we
+use the LCG state to generate four pseudorandom 64-bit weights (more below) and
+add each weight to one of the xoshiro256 state registers, rngState[0..3]. If we
+assume the main RNG remains unused in all tasks, each register rngState[0..3]
+accumulates a different Dot/SplitMix dot product hash as additional child tasks
+are spawned. Each one is collision resistant with a pairwise collision chance of
+only 1/2^64. Assuming that the four pseudorandom 64-bit weight streams are
+sufficiently independent, the pairwise collision probability for distinct tasks
+is 1/2^256. If we somehow managed to spawn a quadrillion tasks, the probability
+of a collision would be on the order of 1/10^48. Practically impossible.
+
+What about the random "junk" that's in the xoshiro256 state registers? For a
+tree of tasks spawned with no intervening samples taken from the main RNG, they
+all start with the same junk which doesn't affect the chance of collision; the
+Dot/SplitMix papers suggest adding a random base value to the dot product
+anyway, so we can consider whatever happens to be in the xoshiro256 registers to
+be that. What if the main RNG is used between task forks? In that case, the
+state register bits are "shuffled" according to the xoshiro256 update
+implemented in jl_genrandom above. The unmodified DotMix collision resistance
+proof obviously doesn't apply then, but we can modify the setup by adding a
+constant difference between the two compression functions and note that we still
+have a 1/N chance of the weight value hitting that exact difference. This proves
+collision resistance even between tasks whose dot product hashes are computed
+with arbitrary offsets. Thus we can conclude collision resistance even in the
+face of different starting states of the main RNG. Does this seem too good to be
+true? Perhaps another way of thinking of it will help: suppose we seeded each
+task randomly? Then there would only be a 1/2^256 chance of collision as well.
+So essentially what the proof is telling us is that the dot product construction
+is a good way to randomly seed each task. From that perspective, adding
+arbitrary junk to each random seed doesn't worsen (or improve) its randomness.
+
+The random weights added to rngState[0..3] in successive child tasks are
+generated by applying four different variations on the PCG-RXS-M-XS-64 output
+function to the same 64-bit LCG state. Another obvious way to generate four
+weights would be to iterate the LCG four times per child task split. A reason
+not to do that is that the LCG update is highly linear and there is a risk that
+if the weights are linearly related, they will not provide independent collision
+resistance, leaving a pairwise collision probability worse than 1/2^256. The PCG
+output function is designed to obfuscate linear relationships between outputs
+and does so quite well, as PCG-RXS-M-XS manages to pass various statistical RNG
+tests with only 36 bits of state, let alone the 64 bits we're using. Different
+output functions seems like a better way to expand a single state into four
+streams. It also means that the full period of the LCG is available to each
+rngState[0..3] register, rather than just 2^60. Since collision resistance is
+proportional to the number of possible weights, this is a benefit. It's an
+obvious concern to worry about whether the approach of using different output
+functions produces weights that are independent enough to provide full collision
+resistance. We obviously can't test that with 256 bits, but we have tested it
+with a reduced state analogue, using an 8-bit LCG and four variations on the
+PCG-RXS-M-XS-8 output function to generate four 8-bit dot products. This test
+does indicate sufficient independence: one register has collisions at 2^5 while
+four registers only start having collisions at 2^20, which is what we'd expect
+if they were truly independent.
+
+It may also be worth noting that in the specific case where a parent task spawns
+a sequence of child tasks with no intervening usage of its main RNG, then the
+parent and child tasks are actually guaranteed to have different RNG states.
+This is true because each of the four PCG streams produces each possible
+64-bit output exactly once in the full 2^64 period of the LCG generator. Thus,
+each of up to 2^64 children will be perturbed by different weights. But what
+about the parent colliding with a child? That can only happen if each of the
+rngState[0..3] registers is perturbed by zero, which cannot happen. Consider
+this part of each output function:
+
+    p ^= p >> ((p >> 59) + 5);
+    p *= m[i];
+    p ^= p >> 43;
+
+This maps zero to zero and, being invertible, maps nothing else to zero. Thus, if
+the `p` values are all zero in the end, they all had to be zero at the beginning,
+which is impossible since they differ from `x` by different additive constants.
+Of course, this doesn't help if the task tree structure is more deeply nested or
+if there are intervening uses of the main RNG, in which case we're back to
+relying on "merely" 256 bits of collision resistance, but it's nice to know that
+in what is likely the most common case RNG collisions are actually impossible.
+
+[1]: http://supertech.csail.mit.edu/papers/dprng.pdf
+
+[2]: https://gee.cs.oswego.edu/dl/papers/oopsla14.pdf
+*/
+void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT
 {
-    /* TODO: consider a less ad-hoc construction
-       Ideally we could just use the output of the random stream to seed the initial
-       state of the child. Out of an overabundance of caution we multiply with
-       effectively random coefficients, to break possible self-interactions.
-
-       It is not the goal to mix bits -- we work under the assumption that the
-       source is well-seeded, and its output looks effectively random.
-       However, xoshiro has never been studied in the mode where we seed the
-       initial state with the output of another xoshiro instance.
-
-       Constants have nothing up their sleeve:
-       0x02011ce34bce797f == hash(UInt(1))|0x01
-       0x5a94851fb48a6e05 == hash(UInt(2))|0x01
-       0x3688cf5d48899fa7 == hash(UInt(3))|0x01
-       0x867b4bb4c42e5661 == hash(UInt(4))|0x01
-    */
-    to[0] = 0x02011ce34bce797f * jl_genrandom(from);
-    to[1] = 0x5a94851fb48a6e05 * jl_genrandom(from);
-    to[2] = 0x3688cf5d48899fa7 * jl_genrandom(from);
-    to[3] = 0x867b4bb4c42e5661 * jl_genrandom(from);
+    // load the internal LCG state, then advance it in both parent (src) and child (dst)
+    uint64_t x = src[4];
+    src[4] = dst[4] = x * 0xd1342543de82ef95 + 1;
+    // high spectrum multiplier from https://arxiv.org/abs/2001.05304
+
+    static const uint64_t a[4] = {
+        0xe5f8fa077b92a8a8, // random additive offsets...
+        0x7a0cd918958c124d,
+        0x86222f7d388588d4,
+        0xd30cbd35f2b64f52
+    };
+    static const uint64_t m[4] = {
+        0xaef17502108ef2d9, // standard PCG multiplier
+        0xf34026eeb86766af, // random odd multipliers...
+        0x38fd70ad58dd9fbb,
+        0x6677f9b93ab0c04d
+    };
+
+    // PCG-RXS-M-XS output with four variants, all derived from the pre-advance state x
+    for (int i = 0; i < 4; i++) {
+        uint64_t p = x + a[i];
+        p ^= p >> ((p >> 59) + 5);
+        p *= m[i];
+        p ^= p >> 43;
+        dst[i] = src[i] + p; // SplitMix dot product; src[i] itself is only read
+    }
 }
 
 JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize)
diff --git a/stdlib/Random/src/Xoshiro.jl b/stdlib/Random/src/Xoshiro.jl
@@ -113,12 +113,17 @@ struct TaskLocalRNG <: AbstractRNG end
 TaskLocalRNG(::Nothing) = TaskLocalRNG()
 rng_native_52(::TaskLocalRNG) = UInt64
 
-function setstate!(x::TaskLocalRNG, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64)
+function setstate!(
+    x::TaskLocalRNG,
+    s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64, # xoshiro256 state registers
+    s4::UInt64 = hash((s0, s1, s2, s3)), # splitmix weight rng state; defaults to a hash of s0..s3
+)
     t = current_task()
     t.rngState0 = s0
     t.rngState1 = s1
     t.rngState2 = s2
     t.rngState3 = s3
+    t.rngState4 = s4
     x
 end
 
@@ -128,11 +133,11 @@ end
     tmp = s0 + s3
     res = ((tmp << 23) | (tmp >> 41)) + s0
     t = s1 << 17
-    s2 = xor(s2, s0)
-    s3 = xor(s3, s1)
-    s1 = xor(s1, s2)
-    s0 = xor(s0, s3)
-    s2 = xor(s2, t)
+    s2 ⊻= s0
+    s3 ⊻= s1
+    s1 ⊻= s2
+    s0 ⊻= s3
+    s2 ⊻= t
     s3 = s3 << 45 | s3 >> 19
     task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3
     res
@@ -159,7 +164,7 @@ seed!(rng::Union{TaskLocalRNG, Xoshiro}, seed::Integer) = seed!(rng, make_seed(s
 @inline function rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{UInt128})
     first = rand(rng, UInt64)
     second = rand(rng,UInt64)
-    second + UInt128(first)<<64
+    second + UInt128(first) << 64 # `<<` binds tighter than `+`: `first` is the high word
 end
 
 @inline rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128
@@ -178,14 +183,14 @@ end
 
 function copy!(dst::TaskLocalRNG, src::Xoshiro)
     t = current_task()
-    t.rngState0, t.rngState1, t.rngState2, t.rngState3 = src.s0, src.s1, src.s2, src.s3
-    dst
+    setstate!(dst, src.s0, src.s1, src.s2, src.s3)
+    return dst
 end
 
 function copy!(dst::Xoshiro, src::TaskLocalRNG)
     t = current_task()
-    dst.s0, dst.s1, dst.s2, dst.s3 = t.rngState0, t.rngState1, t.rngState2, t.rngState3
-    dst
+    setstate!(dst, t.rngState0, t.rngState1, t.rngState2, t.rngState3)
+    return dst # only rngState0..3 are passed on; rngState4 is not copied
 end
 
 function ==(a::Xoshiro, b::TaskLocalRNG)
diff --git a/stdlib/Random/test/runtests.jl b/stdlib/Random/test/runtests.jl
@@ -1018,3 +1018,50 @@ guardseed() do
         @test f42752(true) === val
     end
 end
+
+@testset "TaskLocalRNG: stream collision smoke test" begin
+    # spawn a tree of tasks; each task with d ≥ 0 spawns two recursive children:
+    # - one right away, the other before or after the middle draw (parity of `r`)
+    # - generate a random UInt64 in each before, after and between
+    # - collect and count all the generated random values
+    # these should all be distinct across all tasks
+    function gen(d)
+        r = rand(UInt64)
+        vals = [r]
+        if d ≥ 0
+            append!(vals, gent(d - 1))
+            isodd(r) && append!(vals, gent(d - 1))
+            push!(vals, rand(UInt64))
+            iseven(r) && append!(vals, gent(d - 1))
+        end
+        push!(vals, rand(UInt64))
+    end
+    gent(d) = fetch(@async gen(d)) # run gen(d) in a child task and return its values
+    seeds = rand(RandomDevice(), UInt64, 5)
+    for seed in seeds
+        Random.seed!(seed)
+        vals = gen(6)
+        @test allunique(vals)
+    end
+end
+
+@testset "TaskLocalRNG: child doesn't affect parent" begin
+    seeds = rand(RandomDevice(), UInt64, 5)
+    for seed in seeds
+        Random.seed!(seed)
+        x = rand(UInt64) # reference: first draw after seeding, no tasks spawned
+        y = rand(UInt64) # reference: second draw after seeding, no tasks spawned
+        n = 3
+        for i = 1:n
+            Random.seed!(seed)
+            @sync for j = 0:i # spawn i+1 children before the parent's first draw
+                @async rand(UInt64)
+            end
+            @test x == rand(UInt64)
+            @sync for j = 0:(n-i) # spawn n-i+1 more children before the second draw
+                @async rand(UInt64)
+            end
+            @test y == rand(UInt64)
+        end
+    end
+end