JuliaLang
diff --git a/‎NEWS.md‎
Lines changed: 10 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎doc/src/devdocs/subarrays.md‎
Lines changed: 5 additions & 5 deletions b/‎doc/src/devdocs/subarrays.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎doc/src/manual/performance-tips.md‎
Lines changed: 8 additions & 8 deletions b/‎doc/src/manual/performance-tips.md‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/jltypes.c‎
Lines changed: 12 additions & 4 deletions b/‎src/jltypes.c‎
Lines changed: 12 additions & 4 deletions
diff --git a/‎src/julia.h‎
Lines changed: 4 additions & 0 deletions b/‎src/julia.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/task.c‎
Lines changed: 60 additions & 0 deletions b/‎src/task.c‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎stdlib/LinearAlgebra/test/bunchkaufman.jl‎
Lines changed: 1 addition & 1 deletion b/‎stdlib/LinearAlgebra/test/bunchkaufman.jl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎stdlib/LinearAlgebra/test/cholesky.jl‎
Lines changed: 1 addition & 1 deletion b/‎stdlib/LinearAlgebra/test/cholesky.jl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎stdlib/LinearAlgebra/test/dense.jl‎
Lines changed: 5 additions & 5 deletions b/‎stdlib/LinearAlgebra/test/dense.jl‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎stdlib/LinearAlgebra/test/eigen.jl‎
Lines changed: 1 addition & 1 deletion b/‎stdlib/LinearAlgebra/test/eigen.jl‎
Lines changed: 1 addition & 1 deletion
@@ -26,6 +26,8 @@ Language changes
 * Destructuring will no longer mutate values on the left hand side while iterating through values on the right hand side. In the example
   of an array `x`, `x[2], x[1] = x` will now swap the first and second entry of `x`, whereas it used to fill both entries with `x[1]`
   because `x[2]` was mutated during the iteration of `x`. ([#40737])
+* The default random number generator has changed, so all random numbers will be different (even with the
+  same seed) unless an explicit RNG object is used. See the section on the `Random` standard library below.
 
 Compiler/Runtime improvements
 -----------------------------
@@ -39,7 +41,12 @@ Command-line option changes
 
 Multi-threading changes
 -----------------------
+
 * If the `JULIA_NUM_THREADS` environment variable is set to `auto`, then the number of threads will be set to the number of CPU threads ([#38952])
+* Every `Task` object has a local random number generator state, providing reproducible (schedule-independent) execution
+  of parallel simulation code by default. The default generator is also significantly faster in parallel than in
+  previous versions.
+
 
 Build system changes
 --------------------
@@ -130,6 +137,9 @@ Standard library changes
 
 #### Random
 
+* The default random number generator has been changed from Mersenne Twister to [Xoshiro256++](https://prng.di.unimi.it/).
+  The new generator has smaller state, better performance, and superior statistical properties.
+  This generator is the one used for reproducible Task-local randomness.
 
 #### REPL
 
 
@@ -19,14 +19,14 @@ julia> A = rand(2,3,4);
 
 julia> S1 = view(A, :, 1, 2:3)
 2×2 view(::Array{Float64, 3}, :, 1, 2:3) with eltype Float64:
- 0.200586  0.066423
- 0.298614  0.956753
+ 0.166507  0.97397
+ 0.754392  0.831383
 
 julia> S2 = view(A, 1, :, 2:3)
 3×2 view(::Array{Float64, 3}, 1, :, 2:3) with eltype Float64:
- 0.200586  0.066423
- 0.246837  0.646691
- 0.648882  0.276021
+ 0.166507  0.97397
+ 0.518957  0.0705793
+ 0.503714  0.825124
 ```
 ```@meta
 DocTestSetup = nothing
 
@@ -77,12 +77,12 @@ julia> function sum_global()
        end;
 
 julia> @time sum_global()
-  0.009639 seconds (7.36 k allocations: 300.310 KiB, 98.32% compilation time)
-496.84883432553846
+  0.026328 seconds (9.30 k allocations: 416.747 KiB, 36.50% gc time, 99.48% compilation time)
+508.39048990953665
 
 julia> @time sum_global()
-  0.000140 seconds (3.49 k allocations: 70.313 KiB)
-496.84883432553846
+  0.000075 seconds (3.49 k allocations: 70.156 KiB)
+508.39048990953665
 ```
 
 On the first call (`@time sum_global()`) the function gets compiled. (If you've not yet used [`@time`](@ref)
@@ -113,12 +113,12 @@ julia> function sum_arg(x)
        end;
 
 julia> @time sum_arg(x)
-  0.006202 seconds (4.18 k allocations: 217.860 KiB, 99.72% compilation time)
-496.84883432553846
+  0.010298 seconds (4.23 k allocations: 226.021 KiB, 99.81% compilation time)
+508.39048990953665
 
 julia> @time sum_arg(x)
   0.000005 seconds (1 allocation: 16 bytes)
-496.84883432553846
+508.39048990953665
 ```
 
 The 1 allocation seen is from running the `@time` macro itself in global scope. If we instead run
@@ -129,7 +129,7 @@ julia> time_sum(x) = @time sum_arg(x);
 
 julia> time_sum(x)
   0.000001 seconds
-496.84883432553846
+508.39048990953665
 ```
 
 In some situations, your function may need to allocate memory as part of its operation, and this
 
@@ -2524,7 +2524,7 @@ void jl_init_types(void) JL_GC_DISABLED
                         NULL,
                         jl_any_type,
                         jl_emptysvec,
-                        jl_perm_symsvec(10,
+                        jl_perm_symsvec(14,
                                         "next",
                                         "queue",
                                         "storage",
@@ -2534,8 +2534,12 @@ void jl_init_types(void) JL_GC_DISABLED
                                         "code",
                                         "_state",
                                         "sticky",
-                                        "_isexception"),
-                        jl_svec(10,
+                                        "_isexception",
+                                        "rngState0",
+                                        "rngState1",
+                                        "rngState2",
+                                        "rngState3"),
+                        jl_svec(14,
                                 jl_any_type,
                                 jl_any_type,
                                 jl_any_type,
@@ -2545,7 +2549,11 @@ void jl_init_types(void) JL_GC_DISABLED
                                 jl_any_type,
                                 jl_uint8_type,
                                 jl_bool_type,
-                                jl_bool_type),
+                                jl_bool_type,
+                                jl_uint64_type,
+                                jl_uint64_type,
+                                jl_uint64_type,
+                                jl_uint64_type),
                         0, 1, 6);
     jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type);
     jl_svecset(jl_task_type->types, 0, listt);
 
@@ -1805,6 +1805,10 @@ typedef struct _jl_task_t {
     uint8_t _state;
     uint8_t sticky; // record whether this Task can be migrated to a new thread
     uint8_t _isexception; // set if `result` is an exception to throw or that we exited with
+    uint64_t rngState0; // really rngState[4], but more convenient to split
+    uint64_t rngState1;
+    uint64_t rngState2;
+    uint64_t rngState3;
 
 // hidden state:
     // saved gc stack top for context switches
 
@@ -35,6 +35,7 @@
 #include "julia_internal.h"
 #include "threading.h"
 #include "julia_assert.h"
+#include "support/hashing.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -648,6 +649,63 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED)
     throw_internal(ct, NULL);
 }
 
+/* This is xoshiro256++ 1.0, used for tasklocal random number generation in julia.
+   This implementation is intended for embedders and internal use by the runtime, and is
+   based on the reference implementation on http://prng.di.unimi.it
+
+   Credits go to Sebastiano Vigna for coming up with this PRNG.
+
+   There is a pure julia implementation in stdlib that tends to be faster when used from
+   within julia, due to inlining and more agressive architecture-specific optimizations.
+*/
+JL_DLLEXPORT uint64_t jl_tasklocal_genrandom(jl_task_t *task) JL_NOTSAFEPOINT
+{
+    uint64_t s0 = task->rngState0;
+    uint64_t s1 = task->rngState1;
+    uint64_t s2 = task->rngState2;
+    uint64_t s3 = task->rngState3;
+
+    uint64_t t = s0 << 17;
+    uint64_t tmp = s0 + s3;
+    uint64_t res = ((tmp << 23) | (tmp >> 41)) + s0;
+    s2 ^= s0;
+    s3 ^= s1;
+    s1 ^= s2;
+    s0 ^= s3;
+    s2 ^= t;
+    s3 = (s3 << 45) | (s3 >> 19);
+
+    task->rngState0 = s0;
+    task->rngState1 = s1;
+    task->rngState2 = s2;
+    task->rngState3 = s3;
+    return res;
+}
+
+void rng_split(jl_task_t *from, jl_task_t *to) JL_NOTSAFEPOINT
+{
+    /* TODO: consider a less ad-hoc construction
+       Ideally we could just use the output of the random stream to seed the initial
+       state of the child. Out of an overabundance of caution we multiply with
+       effectively random coefficients, to break possible self-interactions.
+
+       It is not the goal to mix bits -- we work under the assumption that the
+       source is well-seeded, and its output looks effectively random.
+       However, xoshiro has never been studied in the mode where we seed the
+       initial state with the output of another xoshiro instance.
+
+       Constants have nothing up their sleeve:
+       0x02011ce34bce797f == hash(UInt(1))|0x01
+       0x5a94851fb48a6e05 == hash(UInt(2))|0x01
+       0x3688cf5d48899fa7 == hash(UInt(3))|0x01
+       0x867b4bb4c42e5661 == hash(UInt(4))|0x01
+    */
+    to->rngState0 = 0x02011ce34bce797f * jl_tasklocal_genrandom(from);
+    to->rngState1 = 0x5a94851fb48a6e05 * jl_tasklocal_genrandom(from);
+    to->rngState2 = 0x3688cf5d48899fa7 * jl_tasklocal_genrandom(from);
+    to->rngState3 = 0x867b4bb4c42e5661 * jl_tasklocal_genrandom(from);
+}
+
 JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize)
 {
     jl_task_t *ct = jl_current_task;
@@ -683,6 +741,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion
     t->_isexception = 0;
     // Inherit logger state from parent task
     t->logstate = ct->logstate;
+    // Fork task-local random state from parent
+    rng_split(ct, t);
     // there is no active exception handler available on this stack yet
     t->eh = NULL;
     t->sticky = 1;
 
@@ -12,7 +12,7 @@ n = 10
 n1 = div(n, 2)
 n2 = 2*n1
 
-Random.seed!(12343210)
+Random.seed!(12343212)
 
 areal = randn(n,n)/2
 aimg  = randn(n,n)/2
 
@@ -39,7 +39,7 @@ end
     n1 = div(n, 2)
     n2 = 2*n1
 
-    Random.seed!(12343)
+    Random.seed!(12344)
 
     areal = randn(n,n)/2
     aimg  = randn(n,n)/2
 
@@ -15,17 +15,17 @@ n = 10
 n1 = div(n, 2)
 n2 = 2*n1
 
-Random.seed!(1234321)
+Random.seed!(1234323)
 
 @testset "Matrix condition number" begin
     ainit = rand(n,n)
     @testset "for $elty" for elty in (Float32, Float64, ComplexF32, ComplexF64)
         ainit = convert(Matrix{elty}, ainit)
         for a in (copy(ainit), view(ainit, 1:n, 1:n))
-            @test cond(a,1) ≈ 4.837320054554436e+02 atol=0.01
-            @test cond(a,2) ≈ 1.960057871514615e+02 atol=0.01
-            @test cond(a,Inf) ≈ 3.757017682707787e+02 atol=0.01
-            @test cond(a[:,1:5]) ≈ 10.233059337453463 atol=0.01
+            @test cond(a,1) ≈ 50.60863783272028 atol=0.5
+            @test cond(a,2) ≈ 23.059634761613314 atol=0.5
+            @test cond(a,Inf) ≈ 45.12503933120795 atol=0.4
+            @test cond(a[:,1:5]) ≈ 5.719500544258695 atol=0.01
             @test_throws ArgumentError cond(a,3)
         end
     end
 
@@ -11,7 +11,7 @@ n = 10
 n1 = div(n, 2)
 n2 = 2*n1
 
-Random.seed!(1234321)
+Random.seed!(12343219)
 
 areal = randn(n,n)/2
 aimg  = randn(n,n)/2