Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Add normalized equivalent of YieldProcessor, retune some spin loops #13670

Merged
merged 3 commits into from
Sep 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 75 additions & 23 deletions src/mscorlib/shared/System/Threading/SpinWait.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,26 @@ public struct SpinWait
// numbers may seem fairly arbitrary, but were derived with at least some
// thought in the design document. I fully expect they will need to change
// over time as we gain more experience with performance.
internal const int YIELD_THRESHOLD = 10; // When to switch over to a true yield.
internal const int SLEEP_0_EVERY_HOW_MANY_TIMES = 5; // After how many yields should we Sleep(0)?
internal const int SLEEP_1_EVERY_HOW_MANY_TIMES = 20; // After how many yields should we Sleep(1)?
internal const int YieldThreshold = 10; // When to switch over to a true yield.
private const int Sleep0EveryHowManyYields = 5; // After how many yields should we Sleep(0)?
internal const int DefaultSleep1Threshold = 20; // After how many yields should we Sleep(1) frequently?

/// <summary>
/// A suggested number of spin iterations before doing a proper wait, such as waiting on an event that becomes signaled
/// when the resource becomes available.
/// </summary>
/// <remarks>
/// These numbers were arrived at by experimenting with different numbers in various cases that currently use it. It's
/// only a suggested value and typically works well when the proper wait is something like an event.
///
/// Spinning less can lead to early waiting and more context switching, spinning more can decrease latency but may use
/// up some CPU time unnecessarily. Depends on the situation too, for instance SemaphoreSlim uses double this number
/// because the waiting there is currently a lot more expensive (involves more spinning, taking a lock, etc.). It also
/// depends on the likelihood of the spin being successful and how long the wait would be but those are not accounted
/// for here.
/// </remarks>
internal static readonly int SpinCountforSpinBeforeWait = PlatformHelper.IsSingleProcessor ? 1 : 35;
internal const int Sleep1ThresholdForSpinBeforeWait = 40; // should be greater than SpinCountforSpinBeforeWait

// The number of times we've spun already.
private int _count;
Expand All @@ -81,7 +98,12 @@ public struct SpinWait
/// </summary>
public int Count
{
get { return _count; }
get => _count;
internal set
{
Debug.Assert(value >= 0);
_count = value;
}
}

/// <summary>
Expand All @@ -94,10 +116,7 @@ public int Count
/// On a single-CPU machine, <see cref="SpinOnce"/> always yields the processor. On machines with
/// multiple CPUs, <see cref="SpinOnce"/> may yield after an unspecified number of calls.
/// </remarks>
public bool NextSpinWillYield
{
get { return _count > YIELD_THRESHOLD || PlatformHelper.IsSingleProcessor; }
}
public bool NextSpinWillYield => _count >= YieldThreshold || PlatformHelper.IsSingleProcessor;

/// <summary>
/// Performs a single spin.
Expand All @@ -108,7 +127,27 @@ public bool NextSpinWillYield
/// </remarks>
public void SpinOnce()
{
if (NextSpinWillYield)
SpinOnce(DefaultSleep1Threshold);
}

internal void SpinOnce(int sleep1Threshold)
{
Debug.Assert(sleep1Threshold >= YieldThreshold || PlatformHelper.IsSingleProcessor); // so that NextSpinWillYield behaves as requested

// (_count - YieldThreshold) % 2 == 0: The purpose of this check is to interleave Thread.Yield/Sleep(0) with
// Thread.SpinWait. Otherwise, the following issues occur:
// - When there are no threads to switch to, Yield and Sleep(0) become no-op and it turns the spin loop into a
// busy-spin that may quickly reach the max spin count and cause the thread to enter a wait state, or may
// just busy-spin for longer than desired before a Sleep(1). Completing the spin loop too early can cause
// excessive context switching if a wait follows, and entering the Sleep(1) stage too early can cause
// excessive delays.
// - If there are multiple threads doing Yield and Sleep(0) (typically from the same spin loop due to
// contention), they may switch between one another, delaying work that can make progress.
if ((
_count >= YieldThreshold &&
(_count >= sleep1Threshold || (_count - YieldThreshold) % 2 == 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the formatting here reads strangely to me

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's formatted similarly to:

if (a ||
    b)

where

a ==
    (
        c &&
        d
    )

This is how I typically format multi-line expressions, trying to align parentheses and putting each type of expression (&& or ||) separately, one condition per line unless the whole expression fits on one line. What would you suggest instead? I can separate parts of it into locals if you prefer.

) ||
PlatformHelper.IsSingleProcessor)
{
//
// We must yield.
Expand All @@ -125,19 +164,21 @@ public void SpinOnce()
// configured to use the (default) coarse-grained system timer.
//

int yieldsSoFar = (_count >= YIELD_THRESHOLD ? _count - YIELD_THRESHOLD : _count);

if ((yieldsSoFar % SLEEP_1_EVERY_HOW_MANY_TIMES) == (SLEEP_1_EVERY_HOW_MANY_TIMES - 1))
if (_count >= sleep1Threshold)
{
RuntimeThread.Sleep(1);
}
else if ((yieldsSoFar % SLEEP_0_EVERY_HOW_MANY_TIMES) == (SLEEP_0_EVERY_HOW_MANY_TIMES - 1))
{
RuntimeThread.Sleep(0);
}
else
{
RuntimeThread.Yield();
int yieldsSoFar = _count >= YieldThreshold ? (_count - YieldThreshold) / 2 : _count;
if ((yieldsSoFar % Sleep0EveryHowManyYields) == (Sleep0EveryHowManyYields - 1))
{
RuntimeThread.Sleep(0);
}
else
{
RuntimeThread.Yield();
}
}
}
else
Expand All @@ -153,11 +194,24 @@ public void SpinOnce()
// number of spins we are willing to tolerate to reduce delay to the caller,
// since we expect most callers will eventually block anyway.
//
RuntimeThread.SpinWait(4 << _count);
// Also, cap the maximum spin count to a value such that many thousands of CPU cycles would not be wasted doing
// the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
// allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
// usually better for that.
//
// RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration:
// - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
//
int n = RuntimeThread.OptimalMaxSpinWaitsPerSpinIteration;
if (_count <= 30 && (1 << _count) < n)
{
n = 1 << _count;
}
RuntimeThread.SpinWait(n);
}

// Finally, increment our spin counter.
_count = (_count == int.MaxValue ? YIELD_THRESHOLD : _count + 1);
_count = (_count == int.MaxValue ? YieldThreshold : _count + 1);
}

/// <summary>
Expand Down Expand Up @@ -299,9 +353,7 @@ internal static int ProcessorCount
/// <summary>
/// Gets whether the current machine has only a single processor.
/// </summary>
internal static bool IsSingleProcessor
{
get { return ProcessorCount == 1; }
}
/// <remarks>This typically does not change on a machine, so it's checked only once.</remarks>
internal static readonly bool IsSingleProcessor = ProcessorCount == 1;
}
}
29 changes: 29 additions & 0 deletions src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ namespace Internal.Runtime.Augments
{
public class RuntimeThread : CriticalFinalizerObject
{
private static int s_optimalMaxSpinWaitsPerSpinIteration;

internal RuntimeThread() { }

public static RuntimeThread Create(ThreadStart start) => new Thread(start);
Expand Down Expand Up @@ -186,6 +188,33 @@ public void DisableComObjectEagerCleanup()
private extern bool JoinInternal(int millisecondsTimeout);

public static void Sleep(int millisecondsTimeout) => Thread.Sleep(millisecondsTimeout);

// QCall into the runtime that supplies the value cached in s_optimalMaxSpinWaitsPerSpinIteration.
// Callers should go through OptimalMaxSpinWaitsPerSpinIteration rather than calling this directly: per the
// note in that property, the first call in the process triggers a measurement that takes a nontrivial
// amount of time, which is why the result is fetched lazily and cached.
[DllImport(JitHelpers.QCall)]
[SuppressUnmanagedCodeSecurity]
private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();

/// <summary>
/// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
/// appropriate for the processor.
/// </summary>
internal static int OptimalMaxSpinWaitsPerSpinIteration
{
get
{
if (s_optimalMaxSpinWaitsPerSpinIteration != 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s_optimalMaxSpinWaitsPerSpinIteration [](start = 20, length = 37)

It looks like this one can be converted to a readonly field initialized with GetOptimalMaxSpinWaitsPerSpinIterationInternal(), so we can avoid checking for the 0 value.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't want to do that since the first call would trigger the measurement, which takes about 10 ms. Static construction of RuntimeThread probably happens during startup for most apps.

{
return s_optimalMaxSpinWaitsPerSpinIteration;
}

// This is done lazily because the first call to the function below in the process triggers a measurement that
// takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and
// calculates this value.
s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
return s_optimalMaxSpinWaitsPerSpinIteration;
}
}

public static void SpinWait(int iterations) => Thread.SpinWait(iterations);
public static bool Yield() => Thread.Yield();

Expand Down
43 changes: 7 additions & 36 deletions src/mscorlib/src/System/Threading/ManualResetEventSlim.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

using System;
using System.Threading;
using System.Runtime.InteropServices;
using System.Diagnostics;
using System.Diagnostics.Contracts;

Expand Down Expand Up @@ -48,7 +45,6 @@ public class ManualResetEventSlim : IDisposable
{
// These are the default spin counts we use on single-proc and MP machines.
private const int DEFAULT_SPIN_SP = 1;
private const int DEFAULT_SPIN_MP = SpinWait.YIELD_THRESHOLD;

private volatile object m_lock;
// A lock used for waiting and pulsing. Lazily initialized via EnsureLockObjectCreated()
Expand Down Expand Up @@ -193,7 +189,7 @@ public ManualResetEventSlim(bool initialState)
{
// Specify the default spin count, and use default spin if we're
// on a multi-processor machine. Otherwise, we won't.
Initialize(initialState, DEFAULT_SPIN_MP);
Initialize(initialState, SpinWait.SpinCountforSpinBeforeWait);
}

/// <summary>
Expand Down Expand Up @@ -563,44 +559,19 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
bNeedTimeoutAdjustment = true;
}

//spin
int HOW_MANY_SPIN_BEFORE_YIELD = 10;
int HOW_MANY_YIELD_EVERY_SLEEP_0 = 5;
int HOW_MANY_YIELD_EVERY_SLEEP_1 = 20;

// Spin
int spinCount = SpinCount;
for (int i = 0; i < spinCount; i++)
var spinner = new SpinWait();
while (spinner.Count < spinCount)
{
spinner.SpinOnce(SpinWait.Sleep1ThresholdForSpinBeforeWait);

if (IsSet)
{
return true;
}

else if (i < HOW_MANY_SPIN_BEFORE_YIELD)
{
if (i == HOW_MANY_SPIN_BEFORE_YIELD / 2)
{
Thread.Yield();
}
else
{
Thread.SpinWait(4 << i);
}
}
else if (i % HOW_MANY_YIELD_EVERY_SLEEP_1 == 0)
{
Thread.Sleep(1);
}
else if (i % HOW_MANY_YIELD_EVERY_SLEEP_0 == 0)
{
Thread.Sleep(0);
}
else
{
Thread.Yield();
}

if (i >= 100 && i % 10 == 0) // check the cancellation token if the user passed a very large spin count
if (spinner.Count >= 100 && spinner.Count % 10 == 0) // check the cancellation token if the user passed a very large spin count
cancellationToken.ThrowIfCancellationRequested();
}

Expand Down
19 changes: 16 additions & 3 deletions src/mscorlib/src/System/Threading/SemaphoreSlim.cs
Original file line number Diff line number Diff line change
Expand Up @@ -342,15 +342,28 @@ public bool Wait(int millisecondsTimeout, CancellationToken cancellationToken)
CancellationTokenRegistration cancellationTokenRegistration = cancellationToken.InternalRegisterWithoutEC(s_cancellationTokenCanceledEventHandler, this);
try
{
// Perf: first spin wait for the count to be positive, but only up to the first planned yield.
// Perf: first spin wait for the count to be positive.
// This additional amount of spinwaiting in addition
// to Monitor.Enter()’s spinwaiting has shown measurable perf gains in test scenarios.
//

// Monitor.Enter followed by Monitor.Wait is much more expensive than waiting on an event as it involves another
// spin, contention, etc. The usual number of spin iterations that would otherwise be used here is doubled to
// lessen that extra expense of doing a proper wait.
int spinCount = SpinWait.SpinCountforSpinBeforeWait * 2;
int sleep1Threshold = SpinWait.Sleep1ThresholdForSpinBeforeWait * 2;

SpinWait spin = new SpinWait();
while (m_currentCount == 0 && !spin.NextSpinWillYield)
// Bound the spin phase by spinCount: without this limit the loop would keep spinning until
// m_currentCount becomes non-zero and would never fall through to the lock-based wait below.
while (spin.Count < spinCount)
{
spin.SpinOnce();
spin.SpinOnce(sleep1Threshold);

if (m_currentCount != 0)
{
break;
}
}

// entering the lock and incrementing waiters must not suffer a thread-abort, else we cannot
// clean up m_waitCount correctly, which may lead to deadlock due to non-woken waiters.
try { }
Expand Down
Loading