Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Enable a faster mod/GetCurrentProcessorId #27588

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ internal sealed partial class TlsOverPerCoreLockedStacksArrayPool<T> : ArrayPool
/// <summary>The number of buckets (array sizes) in the pool, one for each array length, starting from length 16.</summary>
private const int NumBuckets = 17; // Utilities.SelectBucketIndex(2*1024*1024)
/// <summary>Maximum number of per-core stacks to use per array size.</summary>
private const int MaxPerCorePerArraySizeStacks = 64; // selected to avoid needing to worry about processor groups
/// <summary>The maximum number of buffers to store in a bucket's global queue.</summary>
private const int MaxBuffersPerArraySizePerCore = 8;

/// <summary>The length of arrays stored in the corresponding indices in <see cref="_buckets"/> and <see cref="t_tlsBuckets"/>.</summary>
private readonly int[] _bucketArraySizes;
/// <summary>
Expand Down Expand Up @@ -335,7 +333,7 @@ private sealed class PerCoreLockedStacks
public PerCoreLockedStacks()
{
// Create the stacks. We create as many as there are processors, limited by our max.
var stacks = new LockedStack[Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks)];
var stacks = new LockedStack[PerCoreLockedStacksHelpers.StackCount];
for (int i = 0; i < stacks.Length; i++)
{
stacks[i] = new LockedStack();
Expand All @@ -350,7 +348,8 @@ public void TryPush(T[] array)
// Try to push on to the associated stack first. If that fails,
// round-robin through the other stacks.
LockedStack[] stacks = _perCoreStacks;
int index = Thread.GetCurrentProcessorId() % stacks.Length;
Debug.Assert(stacks.Length == PerCoreLockedStacksHelpers.StackCount);
int index = PerCoreLockedStacksHelpers.GetStackIndexForCurrentProcessor();
for (int i = 0; i < stacks.Length; i++)
{
if (stacks[index].TryPush(array)) return;
Expand All @@ -366,7 +365,8 @@ public void TryPush(T[] array)
// round-robin through the other stacks.
T[]? arr;
LockedStack[] stacks = _perCoreStacks;
int index = Thread.GetCurrentProcessorId() % stacks.Length;
Debug.Assert(stacks.Length == PerCoreLockedStacksHelpers.StackCount);
int index = PerCoreLockedStacksHelpers.GetStackIndexForCurrentProcessor();
for (int i = 0; i < stacks.Length; i++)
{
if ((arr = stacks[index].TryPop()) != null) return arr;
Expand Down Expand Up @@ -498,4 +498,26 @@ public void Trim(uint tickCount, int id, MemoryPressure pressure, int bucketSize
}
}
}

internal static class PerCoreLockedStacksHelpers
{
    // Capped at 64 so we never need to reason about Windows processor groups.
    private const int MaxPerCorePerArraySizeStacks = 64;

    /// <summary>
    /// Number of per-core stacks to create: one per processor, capped at
    /// <see cref="MaxPerCorePerArraySizeStacks"/>.
    /// </summary>
    public static int StackCount { get; } = Math.Min(Environment.ProcessorCount, MaxPerCorePerArraySizeStacks);

    /// <summary>Maps the current processor id to a stack index in the range [0, <see cref="StackCount"/>).</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetStackIndexForCurrentProcessor()
    {
        int id = Thread.GetCurrentProcessorId();

        // Environment.ProcessorCount is a JIT-time constant at Tier1, so exactly one
        // branch of this conditional survives compilation. Below the cap the id needs
        // no reduction; at or above the cap we reduce modulo StackCount, which is also
        // a Tier1 constant, letting the JIT strength-reduce the mod to a cheaper form.
        return Environment.ProcessorCount < MaxPerCorePerArraySizeStacks
            ? id
            : id % StackCount;
    }
}
}
50 changes: 50 additions & 0 deletions src/System.Private.CoreLib/shared/System/Threading/Thread.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.ConstrainedExecution;
using System.Security.Principal;

Expand Down Expand Up @@ -210,6 +211,55 @@ public static void EndCriticalRegion() { }
public static void BeginThreadAffinity() { }
public static void EndThreadAffinity() { }

// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
// the t_currentProcessorIdCache are counting down to get it periodically refreshed.
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
// actions that are likely to result in changing the executing core
[ThreadStatic]
private static int t_currentProcessorIdCache;

private const int ProcessorIdCacheShift = 16;
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
private const int ProcessorIdRefreshRate = 5000;

private static int RefreshCurrentProcessorId()
{
int currentProcessorId = GetCurrentProcessorNumber();

// If GetCurrentProcessorNumber() is not fully implemented it will return -1.
// On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
// doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
// returns -1. As a fallback in that case and to spread the threads across the buckets
// by default, we use the current managed thread ID as a proxy.
if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;

// Ensure the Id is in range of the ProcessorCount
currentProcessorId = (int)((uint)currentProcessorId % (uint)Environment.ProcessorCount);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe in a VM proc Id could be above ProcessorCount. Do we care?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean as to whether it is accurate vs. merely in the range 0–ProcessorCount?

The first means a mod is needed for each use; the second a mod every 5000 calls.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another concern is - is this cheap? The number of cores could be pretty odd nowadays.
I generally round up the number of stripes to the next power of 2 - so that I could use binary masking, which is much cheaper.

In that case some slots may not be used due to the oddness of the proc number (or due to affinity), but that is never a big deal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the mod will be changed to not be a mod at Tier1 as the divisor will be a constant.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the second a mod every 5000 calls.

@VSadov has done a number of experiments around this and we believe that this limit should be <50.

On more recent hardware, none of this TLS caching should be needed and this should just call RDPID instruction that is both fastest and most accurate.

Copy link
Member Author

@benaadams benaadams Oct 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the exact coreid is important over an in range value it would be better to expose RDPID or similar via intrinsics to expose that? Then you can check the .IsSupported flags, do something different for Arm vs Intel etc

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CurrentProcessorId is platform neutral concept. I do not think we need to have yet another set of hardware intrinsics for it.

it also currently adds 100 to it

I would be ok with dropping this and making this API to return the underlying Processor ID.

Copy link
Member Author

@benaadams benaadams Oct 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy to push the mod bypasses out into the callsites rather than in GetCurrentProcessorId

CurrentProcessorId is platform neutral concept.

However, my point is it doesn't return a platform neutral value; its platform specific

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could drop the +100 part.
It is not a big problem for reducer though. Until you have 100+ cores in the system any method of reducing - modding, masking, hashing, should work the same.
After 100+ cores, yes, it could be a bit of a problem for those who use masking.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just changed the call sites in #27600 if that is preferred?


Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);

// Mask with int.MaxValue to ensure the execution Id is not negative
t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;

return currentProcessorId;
}

// Cached processor id used as a hint for which per-core stack to access. It is periodically
// refreshed to trail the actual thread core affinity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetCurrentProcessorId()
{
    // Low ProcessorIdCacheShift bits hold a countdown; the bits above hold the cached id.
    // Read the pre-decrement value, then store the decremented countdown back.
    int cached = t_currentProcessorIdCache;
    t_currentProcessorIdCache = cached - 1;

    // Countdown hit zero (or the thread-static was never initialized): re-query the OS.
    if ((cached & ProcessorIdCacheCountDownMask) == 0)
    {
        return RefreshCurrentProcessorId();
    }

    int id = cached >> ProcessorIdCacheShift;
    Debug.Assert(id >= 0 && id < Environment.ProcessorCount);
    return id;
}

public static LocalDataStoreSlot AllocateDataSlot() => LocalDataStore.AllocateSlot();
public static LocalDataStoreSlot AllocateNamedDataSlot(string name) => LocalDataStore.AllocateNamedSlot(name);
public static LocalDataStoreSlot GetNamedDataSlot(string name) => LocalDataStore.GetNamedSlot(name);
Expand Down
10 changes: 9 additions & 1 deletion src/System.Private.CoreLib/shared/System/Threading/Timer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ internal partial class TimerQueue

public static TimerQueue[] Instances { get; } = CreateTimerQueues();

/// <summary>Returns the <see cref="TimerQueue"/> associated with the processor the
/// calling thread is currently running on (one queue exists per processor).</summary>
public static TimerQueue GetQueueForProcessor()
{
    TimerQueue[] queues = Instances;
    Debug.Assert(Environment.ProcessorCount == queues.Length);

    // GetCurrentProcessorId is expected to already be reduced into
    // [0, ProcessorCount), so it indexes Instances directly without a mod.
    int procId = Thread.GetCurrentProcessorId();
    Debug.Assert(procId >= 0 && procId < queues.Length);

    return queues[procId];
}

private static TimerQueue[] CreateTimerQueues()
{
var queues = new TimerQueue[Environment.ProcessorCount];
Expand Down Expand Up @@ -437,7 +445,7 @@ internal TimerQueueTimer(TimerCallback timerCallback, object? state, uint dueTim
{
_executionContext = ExecutionContext.Capture();
}
_associatedTimerQueue = TimerQueue.Instances[Thread.GetCurrentProcessorId() % TimerQueue.Instances.Length];
_associatedTimerQueue = TimerQueue.GetQueueForProcessor();

// After the following statement, the timer may fire. No more manipulation of timer state outside of
// the lock is permitted beyond this point!
Expand Down
46 changes: 0 additions & 46 deletions src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
Original file line number Diff line number Diff line change
Expand Up @@ -487,52 +487,6 @@ private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
[MethodImpl(MethodImplOptions.InternalCall)]
private static extern int GetCurrentProcessorNumber();

// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
// the t_currentProcessorIdCache are counting down to get it periodically refreshed.
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
// actions that are likely to result in changing the executing core
[ThreadStatic]
private static int t_currentProcessorIdCache;

// Number of low bits reserved for the refresh countdown; the id lives above them.
private const int ProcessorIdCacheShift = 16;
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
// How many calls to GetCurrentProcessorId happen between OS re-queries.
private const int ProcessorIdRefreshRate = 5000;

// Re-queries the OS for the current processor number, re-arms the per-thread
// countdown cache, and returns the (offset) id. Called only when the countdown
// in t_currentProcessorIdCache reaches zero.
private static int RefreshCurrentProcessorId()
{
    int currentProcessorId = GetCurrentProcessorNumber();

    // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
    // doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber()
    // returns -1. As a fallback in that case and to spread the threads across the buckets
    // by default, we use the current managed thread ID as a proxy.
    if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;

    // Add offset to make it clear that it is not guaranteed to be 0-based processor number
    currentProcessorId += 100;

    Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);

    // Mask with int.MaxValue to ensure the execution Id is not negative
    t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;

    return currentProcessorId;
}

// Cached processor id used as a hint for which per-core stack to access. It is periodically
// refreshed to trail the actual thread core affinity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetCurrentProcessorId()
{
    // Post-decrement: test the pre-decrement value so the refresh triggers exactly
    // when the countdown bits are zero, then fall through to the cached id otherwise.
    int currentProcessorIdCache = t_currentProcessorIdCache--;
    if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
    {
        return RefreshCurrentProcessorId();
    }

    return currentProcessorIdCache >> ProcessorIdCacheShift;
}

internal void ResetThreadPoolThread()
{
// Currently implemented in unmanaged method Thread::InternalReset and
Expand Down