Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Enable a faster mod/GetCurrentProcessorId #27588

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool
/// <summary>The number of buckets (array sizes) in the pool, one for each array length, starting from length 16.</summary>
private const int NumBuckets = 17; // Utilities.SelectBucketIndex(2*1024*1024)
/// <summary>Maximum number of per-core stacks to use per array size.</summary>
private const int MaxPerCorePerArraySizeStacks = 64; // selected to avoid needing to worry about processor groups
/// <summary>The maximum number of buffers to store in a bucket's global queue.</summary>
private const int MaxBuffersPerArraySizePerCore = 8;

/// <summary>The length of arrays stored in the corresponding indices in <see cref="_buckets"/> and <see cref="t_tlsBuckets"/>.</summary>
private readonly int[] _bucketArraySizes;
/// <summary>
Expand Down Expand Up @@ -335,7 +333,7 @@ private sealed class PerCoreLockedStacks
public PerCoreLockedStacks()
{
// Create the stacks. We create as many as there are processors, limited by our max.
var stacks = new LockedStack[Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks)];
var stacks = new LockedStack[PerCoreLockedStacksHelpers.StackCount];
for (int i = 0; i < stacks.Length; i++)
{
stacks[i] = new LockedStack();
Expand All @@ -350,7 +348,8 @@ public void TryPush(T[] array)
// Try to push on to the associated stack first. If that fails,
// round-robin through the other stacks.
LockedStack[] stacks = _perCoreStacks;
int index = Thread.GetCurrentProcessorId() % stacks.Length;
Debug.Assert(stacks.Length == PerCoreLockedStacksHelpers.StackCount);
int index = PerCoreLockedStacksHelpers.GetStackIndexForCurrentProcessor();
for (int i = 0; i < stacks.Length; i++)
{
if (stacks[index].TryPush(array)) return;
Expand All @@ -366,7 +365,8 @@ public void TryPush(T[] array)
// round-robin through the other stacks.
T[]? arr;
LockedStack[] stacks = _perCoreStacks;
int index = Thread.GetCurrentProcessorId() % stacks.Length;
Debug.Assert(stacks.Length == PerCoreLockedStacksHelpers.StackCount);
int index = PerCoreLockedStacksHelpers.GetStackIndexForCurrentProcessor();
for (int i = 0; i < stacks.Length; i++)
{
if ((arr = stacks[index].TryPop()) != null) return arr;
Expand Down Expand Up @@ -498,4 +498,26 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize
}
}
}

internal static class PerCoreLockedStacksHelpers
{
    // Capped at 64 so we never need to reason about Windows processor groups.
    private const int MaxPerCorePerArraySizeStacks = 64;

    /// <summary>
    /// Number of per-core stacks to create: one per processor, capped at
    /// <see cref="MaxPerCorePerArraySizeStacks"/>.
    /// </summary>
    public static int StackCount { get; } = Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks);

    /// <summary>Maps the current processor id to a stack index in the range [0, <see cref="StackCount"/>).</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetStackIndexForCurrentProcessor()
    {
        int id = Thread.GetCurrentProcessorId();

        // Environment.ProcessorCount is a JIT-time constant at Tier1, so exactly one
        // branch of this conditional survives compilation. Below the cap the id needs
        // no reduction; at or above the cap we reduce modulo StackCount, which is also
        // a Tier1 constant, letting the JIT strength-reduce the mod to a cheaper form.
        return Environment.ProcessorCount < MaxPerCorePerArraySizeStacks
            ? id
            : id % StackCount;
    }
}
}
50 changes: 50 additions & 0 deletions src/System.Private.CoreLib/shared/System/Threading/Thread.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.ConstrainedExecution;
using System.Security.Principal;

Expand Down Expand Up @@ -210,6 +211,55 @@ public static void EndCriticalRegion() { }
public static void BeginThreadAffinity() { }
public static void EndThreadAffinity() { }

// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
// the t_currentProcessorIdCache are counting down to get it periodically refreshed.
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
// actions that are likely to result in changing the executing core
[ThreadStatic]
private static int t_currentProcessorIdCache;

private const int ProcessorIdCacheShift = 16;
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
private const int ProcessorIdRefreshRate = 5000;

private static int RefreshCurrentProcessorId()
{
int currentProcessorId = GetCurrentProcessorNumber();

// If GetCurrentProcessorNumber() is not fully implemented it will return -1.
// On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
// doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
// returns -1. As a fallback in that case and to spread the threads across the buckets
// by default, we use the current managed thread ID as a proxy.
if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;

// Ensure the Id is in range of the ProcessorCount
currentProcessorId = (int)((uint)currentProcessorId % (uint)Environment.ProcessorCount);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe in a VM proc Id could be above ProcessorCount. Do we care?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean as to whether it is accurate vs. merely in the range 0–ProcessorCount?

The first means a mod is needed for each use; the second a mod every 5000 calls.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another concern is - is this cheap? The number of cores could be pretty odd nowadays.
I generally round up the number of stripes to the next power of 2 - so that I could use binary masking, which is much cheaper.

In that case some slots may not be used due to the oddness of the proc number (or due to affinity), but that is never a big deal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the mod will be changed to not be a mod at Tier1 as the divisor will be a constant.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the second a mod every 5000 calls.

@VSadov has done a number of experiments around this and we believe that this limit should be <50.

On more recent hardware, none of this TLS caching should be needed and this should just call RDPID instruction that is both fastest and most accurate.

Copy link
Member Author

@benaadams benaadams Oct 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the exact coreid is important over an in range value it would be better to expose RDPID or similar via intrinsics to expose that? Then you can check the .IsSupported flags, do something different for Arm vs Intel etc

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CurrentProcessorId is platform neutral concept. I do not think we need to have yet another set of hardware intrinsics for it.

it also currently adds 100 to it

I would be ok with dropping this and making this API to return the underlying Processor ID.

Copy link
Member Author

@benaadams benaadams Oct 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy to push the mod bypasses out into the callsites rather than in GetCurrentProcessorId

CurrentProcessorId is platform neutral concept.

However, my point is it doesn't return a platform neutral value; its platform specific

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could drop the +100 part.
It is not a big problem for reducer though. Until you have 100+ cores in the system any method of reducing - modding, masking, hashing, should work the same.
After 100+ cores, yes, it could be a bit of a problem for those who use masking.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just changed the call sites in #27600 if that is preferred?


Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);

// Mask with int.MaxValue to ensure the execution Id is not negative
t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;

return currentProcessorId;
}

// Cached processor id used as a hint for which per-core stack to access. It is periodically
// refreshed to trail the actual thread core affinity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetCurrentProcessorId()
{
    // Low ProcessorIdCacheShift bits hold a countdown; the bits above hold the cached id.
    // Read the pre-decrement value, then store the decremented countdown back.
    int cached = t_currentProcessorIdCache;
    t_currentProcessorIdCache = cached - 1;

    // Countdown hit zero (or the thread-static was never initialized): re-query the OS.
    if ((cached & ProcessorIdCacheCountDownMask) == 0)
    {
        return RefreshCurrentProcessorId();
    }

    int id = cached >> ProcessorIdCacheShift;
    Debug.Assert(id >= 0 && id < Environment.ProcessorCount);
    return id;
}

public static LocalDataStoreSlot AllocateDataSlot() => LocalDataStore.AllocateSlot();
public static LocalDataStoreSlot AllocateNamedDataSlot(string name) => LocalDataStore.AllocateNamedSlot(name);
public static LocalDataStoreSlot GetNamedDataSlot(string name) => LocalDataStore.GetNamedSlot(name);
Expand Down
10 changes: 9 additions & 1 deletion src/System.Private.CoreLib/shared/System/Threading/Timer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ internal partial class TimerQueue

public static TimerQueue[] Instances { get; } = CreateTimerQueues();

/// <summary>Returns the <see cref="TimerQueue"/> associated with the processor the
/// calling thread is currently running on (one queue exists per processor).</summary>
public static TimerQueue GetQueueForProcessor()
{
    TimerQueue[] queues = Instances;
    Debug.Assert(Environment.ProcessorCount == queues.Length);

    // GetCurrentProcessorId is expected to already be reduced into
    // [0, ProcessorCount), so it indexes Instances directly without a mod.
    int procId = Thread.GetCurrentProcessorId();
    Debug.Assert(procId >= 0 && procId < queues.Length);

    return queues[procId];
}

private static TimerQueue[] CreateTimerQueues()
{
var queues = new TimerQueue[Environment.ProcessorCount];
Expand Down Expand Up @@ -437,7 +445,7 @@ internal TimerQueueTimer(TimerCallback timerCallback, object? state, uint dueTim
{
_executionContext = ExecutionContext.Capture();
}
_associatedTimerQueue = TimerQueue.Instances[Thread.GetCurrentProcessorId() % TimerQueue.Instances.Length];
_associatedTimerQueue = TimerQueue.GetQueueForProcessor();

// After the following statement, the timer may fire. No more manipulation of timer state outside of
// the lock is permitted beyond this point!
Expand Down
46 changes: 0 additions & 46 deletions src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
Original file line number Diff line number Diff line change
Expand Up @@ -487,52 +487,6 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
[MethodImpl(MethodImplOptions.InternalCall)]
private static extern int GetCurrentProcessorNumber();

// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
// the t_currentProcessorIdCache are counting down to get it periodically refreshed.
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
// actions that are likely to result in changing the executing core
[ThreadStatic]
private static int t_currentProcessorIdCache;

// Number of low bits reserved for the refresh countdown; the id lives above them.
private const int ProcessorIdCacheShift = 16;
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
// How many calls to GetCurrentProcessorId happen between OS re-queries.
private const int ProcessorIdRefreshRate = 5000;

// Re-queries the OS for the current processor number, re-arms the per-thread
// countdown cache, and returns the (offset) id. Called only when the countdown
// in t_currentProcessorIdCache reaches zero.
private static int RefreshCurrentProcessorId()
{
    int currentProcessorId = GetCurrentProcessorNumber();

    // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
    // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
    // returns -1. As a fallback in that case and to spread the threads across the buckets
    // by default, we use the current managed thread ID as a proxy.
    if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;

    // Add offset to make it clear that it is not guaranteed to be 0-based processor number
    currentProcessorId += 100;

    Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);

    // Mask with int.MaxValue to ensure the execution Id is not negative
    t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;

    return currentProcessorId;
}

// Cached processor id used as a hint for which per-core stack to access. It is periodically
// refreshed to trail the actual thread core affinity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetCurrentProcessorId()
{
    // Post-decrement: test the pre-decrement value so the refresh triggers exactly
    // when the countdown bits are zero, then fall through to the cached id otherwise.
    int currentProcessorIdCache = t_currentProcessorIdCache--;
    if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
    {
        return RefreshCurrentProcessorId();
    }

    return currentProcessorIdCache >> ProcessorIdCacheShift;
}

internal void ResetThreadPoolThread()
{
// Currently implemented in unmanaged method Thread::InternalReset and
Expand Down