@@ -866,28 +866,160 @@ uint64_t jl_genrandom(uint64_t rngState[4]) JL_NOTSAFEPOINT
866
866
return res ;
867
867
}
868
868
869
- void jl_rng_split (uint64_t to [4 ], uint64_t from [4 ]) JL_NOTSAFEPOINT
869
+ /*
870
+ The jl_rng_split function forks a task's RNG state in a way that is essentially
871
+ guaranteed to avoid collisions between the RNG streams of all forked tasks. The
872
+ main RNG is the xoshiro256++ RNG whose state is stored in rngState[0..3]. There
873
+ is a small internal RNG used for task forking stored in rngState[4]. This state
874
+ is a LCG (linear congruential generator), which is put through four different
875
+ variations of the strongest PCG output function, referred to as PCG-RXS-M-XS-64.
876
+ This output function is invertible: it maps a 64-bit state to 64-bit output, so
877
+ it's not recommended for general purpose RNG usage. In our usage, however, the
878
+ invertibility is actually a benefit, and we only use the RNG output internally.
879
+
880
+ The goal of this function is to perturb the state of each child task's RNG in
881
+ such a way that for an entire tree of tasks spawned starting with a given seed
882
+ in a root task, no two tasks have the same RNG state. Moreover, we want to do
883
+ this in a way that is deterministic and repeatable based on the root task's seed
884
+ and the task tree structure. The RNG state of a parent task is allowed to alter
885
+ the RNG state of a child task. The mere fact that a child was spawned should not
886
+ alter the RNG output of the parent, but, of course, children spawned after that
887
+ should have distinct RNG states from previously spawned children.
888
+
889
+ The basic approach is that used by the DotMix [1] and SplitMix [2] systems: each
890
+ task is uniquely identified by a sequence of "pedigree" numbers, indicating
891
+ where in the task tree it was spawned. This vector of pedigree coordinates is
892
+ then reduced to a single value by computing a dot product with a common vector
893
+ of random weights. The DotMix paper provides a proof that this dot product hash
894
+ value (referred to as a "compression function") is collision resistant in the
895
+ sense that the pairwise collision probability of two distinct tasks is 1/N where
896
+ N is the number of possible weight values. Both DotMix and SplitMix use a prime
897
+ value of N because the proof requires that the difference between two distinct
898
+ pedigree coordinates must be invertible, which is guaranteed by N being prime.
899
+ We take a different approach, however---we limit pedigree coordinates to being
900
+ binary instead: when a task spawns a child, both tasks share the same pedigree
901
+ prefix, with the parent appending a zero and the child appending a one. This
902
+ way a binary vector uniquely identifies each task. Since the coordinates are
903
+ binary, the difference between coordinates in the proof can be taken to always
904
+ be one, which must be invertible, regardless of whether N is prime or not. This
905
+ allows us to compute the dot product using native machine arithmetic, modulo
906
+ 2^64 instead of arithmetic in a prime modulus. It also means that when updating
907
+ the dot product incrementally, as described in SplitMix, we don't need to
908
+ multiply weights by anything, since the coordinate is always zero in the parent (no
909
+ change) and one in the child, which simply entails adding the weight.
910
+
911
+ We use the internal LCG maintained in rngState[4] to generate random weights:
912
+ each time a child is forked, we update the LCG in both parent and child tasks.
913
+ In the parent, that's all we do; the main RNG state is unchanged, but the next
914
+ time the parent forks a child, the Dot/SplitMix weight used will be different,
915
+ corresponding to being a level deeper in the binary task tree. In the child, we
916
+ use the LCG state to generate four pseudorandom 64-bit weights (more below) and
917
+ add each weight to one of the xoshiro256 state registers, rngState[0..3]. If we
918
+ assume the main RNG remains unused in all tasks, each register rngState[0..3]
919
+ accumulates a different Dot/SplitMix dot product hash as additional child tasks
920
+ are spawned. Each one is collision resistant with a pairwise collision chance of
921
+ only 1/2^64. Assuming that the four pseudorandom 64-bit weight streams are
922
+ sufficiently independent, the pairwise collision probability for distinct tasks
923
+ is 1/2^256. If we somehow managed to spawn a quadrillion tasks, the probability
924
+ of a collision would be on the order of 1/10^48. Practically impossible.
925
+
926
+ What about the random "junk" that's in the xoshiro256 state registers? For a
927
+ tree of tasks spawned with no intervening samples taken from the main RNG, they
928
+ all start with the same junk which doesn't affect the chance of collision; the
929
+ Dot/SplitMix papers suggest adding a random base value to the dot product
930
+ anyway, so we can consider whatever happens to be in the xoshiro256 registers to
931
+ be that. What if the main RNG is used between task forks? In that case, the
932
+ state register bits are "shuffled" according to the xoshiro256 update
933
+ implemented in jl_genrandom above. The unmodified DotMix collision resistance
934
+ proof obviously doesn't apply then, but we can modify the setup by adding a
935
+ constant difference between the two compression functions and note that we still
936
+ have a 1/N chance of the weight value hitting that exact difference. This proves
937
+ collision resistance even between tasks whose dot product hashes are computed
938
+ with arbitrary offsets. Thus we can conclude collision resistance even in the
939
+ face of different starting states of the main RNG. Does this seem too good to be
940
+ true? Perhaps another way of thinking of it will help: suppose we seeded each
941
+ task randomly? Then there would only be a 1/2^256 chance of collision as well.
942
+ So essentially what the proof is telling us is that the dot product construction
943
+ is a good way to randomly seed each task. From that perspective, adding
944
+ arbitrary junk to each random seed doesn't worsen (or improve) its randomness.
945
+
946
+ The random weights added to rngState[0..3] in successive child tasks are
947
+ generated by applying four different variations on the PCG-RXS-M-XS-64 output
948
+ function to the same 64-bit LCG state. Another obvious way to generate four
949
+ weights would be to iterate the LCG four times per child task split. A reason
950
+ not to do that is that the LCG update is highly linear and there is a risk that
951
+ if the weights are linearly related, they will not provide independent collision
952
+ resistance, leaving a pairwise collision probability worse than 1/2^256. The PCG
953
+ output function is designed to obfuscate linear relationships between outputs
954
+ and does so quite well, as PCG-RXS-M-XS manages to pass various statistical RNG
955
+ tests with only 36 bits of state, let alone the 64 bits we're using. Different
956
+ output functions seem like a better way to expand a single state into four
957
+ streams. It also means that the full period of the LCG is available to each
958
+ rngState[0..3] register, rather than just 2^62. Since collision resistance is
959
+ proportional to the number of possible weights, this is a benefit. It's an
960
+ obvious concern to worry about whether the approach of using different output
961
+ functions produces weights that are independent enough to provide full collision
962
+ resistance. We obviously can't test that with 256 bits, but we have tested it
963
+ with a reduced state analogue, using an 8-bit LCG and four variations on the
964
+ PCG-RXS-M-XS-8 output function to generate four 8-bit dot products. This test
965
+ does indicate sufficient independence: one register has collisions at 2^5 while
966
+ four registers only start having collisions at 2^20, which is what we'd expect
967
+ if they were truly independent.
968
+
969
+ It may also be worth noting that in the specific case where a parent task spawns
970
+ a sequence of child tasks with no intervening usage of its main RNG, then the
971
+ parent and child tasks are actually guaranteed to have different RNG states.
972
+ This is true because each of the four PCG streams produces each possible
973
+ 64-bit output exactly once in the full 2^64 period of the LCG generator. Thus,
974
+ each of up to 2^64 children will be perturbed by different weights. But what
975
+ about the parent colliding with a child? That can only happen if each of the
976
+ rngState[0..3] registers is perturbed by zero, which cannot happen. Consider
977
+ this part of each output function:
978
+
979
+ p ^= p >> ((p >> 59) + 5);
980
+ p *= m[i];
981
+ p ^= p >> 43;
982
+
983
+ It's easy to check that this maps zero to zero and, since each step is invertible, that only zero maps to zero. Thus, if the different `p`
984
+ values are zero in the end, then they all had to be zero at the beginning, which
985
+ is impossible since they each differ from `x` by different additive constants.
986
+ Of course, this doesn't help if the task tree structure is more deeply nested or
987
+ if there are intervening uses of the main RNG, in which case we're back to
988
+ relying on "merely" 256 bits of collision resistance, but it's nice to know that
989
+ in what is likely the most common case RNG collisions are actually impossible.
990
+
991
+ [1]: http://supertech.csail.mit.edu/papers/dprng.pdf
992
+
993
+ [2]: https://gee.cs.oswego.edu/dl/papers/oopsla14.pdf
994
+ */
995
+ void jl_rng_split (uint64_t dst [JL_RNG_SIZE ], uint64_t src [JL_RNG_SIZE ]) // fills dst[0..4] from src; also advances src[4] in place, src[0..3] are only read
870
996
{
871
- /* TODO: consider a less ad-hoc construction
872
- Ideally we could just use the output of the random stream to seed the initial
873
- state of the child. Out of an overabundance of caution we multiply with
874
- effectively random coefficients, to break possible self-interactions.
875
-
876
- It is not the goal to mix bits -- we work under the assumption that the
877
- source is well-seeded, and its output looks effectively random.
878
- However, xoshiro has never been studied in the mode where we seed the
879
- initial state with the output of another xoshiro instance.
880
-
881
- Constants have nothing up their sleeve:
882
- 0x02011ce34bce797f == hash(UInt(1))|0x01
883
- 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
884
- 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
885
- 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
886
- */
887
- to [0 ] = 0x02011ce34bce797f * jl_genrandom (from );
888
- to [1 ] = 0x5a94851fb48a6e05 * jl_genrandom (from );
889
- to [2 ] = 0x3688cf5d48899fa7 * jl_genrandom (from );
890
- to [3 ] = 0x867b4bb4c42e5661 * jl_genrandom (from );
997
+ // load the internal LCG state from src[4], then store the advanced value into both src[4] and dst[4]
998
+ uint64_t x = src [4 ]; // x keeps the pre-advance value; every weight in the loop below is derived from it
999
+ src [4 ] = dst [4 ] = x * 0xd1342543de82ef95 + 1 ; // uint64_t arithmetic wraps modulo 2^64; index 4 needs JL_RNG_SIZE >= 5 (defined outside this view -- TODO confirm)
1000
+ // 0xd1342543de82ef95 above is the high spectrum multiplier from https://arxiv.org/abs/2001.05304
1001
+
1002
+ static const uint64_t a [4 ] = { // a[i] is added to x to form the input of the i-th output variant
1003
+ 0xe5f8fa077b92a8a8 , // random additive offsets...
1004
+ 0x7a0cd918958c124d ,
1005
+ 0x86222f7d388588d4 ,
1006
+ 0xd30cbd35f2b64f52
1007
+ };
1008
+ static const uint64_t m [4 ] = { // m[i] is the multiplier used by the i-th output variant
1009
+ 0xaef17502108ef2d9 , // standard PCG multiplier
1010
+ 0xf34026eeb86766af , // random odd multipliers...
1011
+ 0x38fd70ad58dd9fbb ,
1012
+ 0x6677f9b93ab0c04d
1013
+ };
1014
+
1015
+ // PCG-RXS-M-XS output with four variants, one per word dst[0..3]
1016
+ for (int i = 0 ; i < 4 ; i ++ ) {
1017
+ uint64_t p = x + a [i ]; // offset the shared pre-advance LCG state by the i-th constant
1018
+ p ^= p >> ((p >> 59 ) + 5 ); // shift count comes from the top 5 bits of p, so it ranges over 5..36
1019
+ p *= m [i ]; // multiply, wrapping modulo 2^64
1020
+ p ^= p >> 43 ; // xor the high 21 bits into the low 21 bits
1021
+ dst [i ] = src [i ] + p ; // SplitMix dot product: dst[i] receives src[i] plus the weight p
1022
+ }
891
1023
}
892
1024
893
1025
JL_DLLEXPORT jl_task_t * jl_new_task (jl_function_t * start , jl_value_t * completion_future , size_t ssize )
0 commit comments