Closed
Description
Description
Interlocked.Read calls from 32-bit apps are much slower (about 100x) on Intel Sapphire Rapids CPUs.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace UnmanagedMemoryStreamPerfTest
{
    /// <summary>
    /// Micro-benchmark that hammers <see cref="Stream.Length"/> on
    /// <see cref="UnmanagedMemoryStream"/> instances from many threads so the cost of the
    /// 64-bit interlocked read can be compared between 32-bit and 64-bit processes.
    /// </summary>
    class Program
    {
        /// <summary>Number of concurrent workers (each gets its own stream).</summary>
        private const int WorkerCount = 100;

        /// <summary>Number of length reads performed by every worker.</summary>
        private const int IterationsPerWorker = 1000000;

        /// <summary>True when the current process uses 4-byte pointers.</summary>
        private static readonly bool s_is32BitProcess = GetIs32BitProcess();

        /// <summary>
        /// Entry point: prints the process bitness, runs the benchmark, prints the elapsed
        /// milliseconds and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            // Build (and release) one stream before timing so the stream code path has
            // already been exercised once when the stopwatch starts.
            using (GetStream())
            {
            }

            Console.WriteLine("Is 32 bit process: " + s_is32BitProcess);

            Stopwatch sw = Stopwatch.StartNew();
            // A console app has no synchronization context, so blocking here cannot
            // deadlock; it guarantees the stopwatch covers all the work and that worker
            // exceptions surface instead of being lost in an async void method.
            MainAsync().GetAwaiter().GetResult();
            sw.Stop();

            Console.WriteLine(sw.ElapsedMilliseconds);
            Console.ReadLine();
        }

        /// <summary>
        /// Creates one stream per worker, runs all workers concurrently and disposes the
        /// streams (releasing their unmanaged memory) once every worker has finished.
        /// </summary>
        /// <returns>A task that completes when all workers are done.</returns>
        static async Task MainAsync()
        {
            // 100 threads to make it even more obvious.
            List<Stream> streams = new List<Stream>(WorkerCount);
            try
            {
                // Create every stream before starting any worker so that a failure while
                // allocating cannot dispose a stream that a running worker still uses.
                for (int i = 0; i < WorkerCount; i++)
                {
                    streams.Add(GetStream());
                }

                List<Task> workers = new List<Task>(WorkerCount);
                foreach (Stream stream in streams)
                {
                    workers.Add(DoStuff(stream));
                }

                await Task.WhenAll(workers).ConfigureAwait(false);
            }
            finally
            {
                foreach (Stream stream in streams)
                {
                    stream.Dispose();
                }
            }
        }

        /// <summary>
        /// Starts a worker that reads the stream length <see cref="IterationsPerWorker"/> times.
        /// </summary>
        /// <param name="stream">Stream whose length is queried; must stay open until the task completes.</param>
        /// <returns>The task representing the running worker.</returns>
        static Task DoStuff(Stream stream)
        {
            // Contrived, but this is just so we can see it.
            // For every XAML file, WPF will read BAML streams from the assembly.
            // For apps with dense UIs, that can be thousands of streams to open a screen.
            // The problem: 32-bit will be about 3-20 times slower, and 200 times slower on
            // high end 2023 AMD and Intel chips due to a throttling feature for the type of
            // lock UnmanagedMemoryStream takes.
            //
            // LongRunning asks the scheduler for a dedicated thread per worker, so the
            // workers really do run side by side instead of sequentially on the caller.
            return Task.Factory.StartNew(
                () =>
                {
                    for (int i = 0; i < IterationsPerWorker; i++)
                    {
                        DoStuffStream(stream);
                        // Swap in one of these to measure the raw primitives instead:
                        //DoStuffInterlocked();
                        //DoStuffInterlockedTreF();
                    }
                },
                CancellationToken.None,
                TaskCreationOptions.LongRunning,
                TaskScheduler.Default);
        }

        /// <summary>Reads the length of <paramref name="stream"/>.</summary>
        /// <param name="stream">The stream to query.</param>
        /// <returns>The value of <see cref="Stream.Length"/>.</returns>
        static long DoStuffStream(Stream stream)
        {
            return stream.Length;
        }

        /// <summary>Performs a 64-bit <see cref="Interlocked.Read(ref long)"/> on a local.</summary>
        /// <returns>The atomically read value (always 0 here).</returns>
        static long DoStuffInterlocked()
        {
            long len = 0;
            // Return the atomically read value rather than re-reading the local.
            return Interlocked.Read(ref len);
        }

        /// <summary>
        /// Candidate workaround: in a 32-bit process read only 32 bits of the value with a
        /// 32-bit interlocked operation; otherwise fall back to the 64-bit read.
        /// </summary>
        /// <returns>The atomically read value (always 0 here).</returns>
        static unsafe long DoStuffInterlockedTreF()
        {
            // Inspiration from Parallel.For() fix //https://devdiv.visualstudio.com/DevDiv/_search?text=969699&type=workitem&pageSize=25&filters=Projects%7BDevDiv%7D
            // NOTE(review): the original note here said Interlocked.Read uses Add under the
            // hood; the public .NET sources appear to implement it with CompareExchange.
            // Confirm against the target runtime before relying on either.
            // Gotcha is that 32-bit apps could then only use 2GB streams, limited by 32-bit int.
            // Maybe another flag: _BigStream.
            long len = 0;
            if (s_is32BitProcess /*&& !_BigStream*/)
            {
                // Adding 0 atomically returns the current 32-bit value at the address of
                // len (the low half on little-endian hardware) without changing it.
                return Interlocked.Add(ref *(int*)&len, 0);
            }

            return Interlocked.Read(ref len);
        }

        /// <summary>
        /// Copies the sample message into a freshly allocated unmanaged block and returns a
        /// read-only stream over it. Disposing the returned stream frees the block.
        /// </summary>
        /// <returns>An open, readable stream positioned at the start of the message bytes.</returns>
        static unsafe Stream GetStream()
        {
            // Based on the sample at
            // https://learn.microsoft.com/en-us/dotnet/api/system.io.unmanagedmemorystream?view=net-7.0
            byte[] message = Encoding.Unicode.GetBytes(GetMessage());

            // Allocate a block of unmanaged memory large enough for the message.
            IntPtr memory = Marshal.AllocHGlobal(message.Length);
            try
            {
                // Write the message through a temporary write-only stream.
                using (UnmanagedMemoryStream writer = new UnmanagedMemoryStream(
                    (byte*)memory.ToPointer(), message.Length, message.Length, FileAccess.Write))
                {
                    writer.Write(message, 0, message.Length);
                }

                // Hand ownership of the block to the stream that is returned.
                return new HGlobalReadStream(memory, message.Length);
            }
            catch
            {
                // Nothing took ownership of the block, so release it before propagating.
                Marshal.FreeHGlobal(memory);
                throw;
            }
        }

        /// <summary>Returns the text that is stored in every benchmark stream.</summary>
        static string GetMessage()
        {
            return "Here is some data.";
        }

        /// <summary>Returns true when pointers are 4 bytes wide, i.e. the process is 32-bit.</summary>
        static bool GetIs32BitProcess()
        {
            return IntPtr.Size == 4;
        }

        /// <summary>
        /// Read-only <see cref="UnmanagedMemoryStream"/> that owns its
        /// <see cref="Marshal.AllocHGlobal(int)"/> block and frees it when disposed.
        /// It does not override <see cref="Stream.Length"/>, so length reads still go
        /// through the base class implementation.
        /// </summary>
        private sealed unsafe class HGlobalReadStream : UnmanagedMemoryStream
        {
            private IntPtr _memory;

            /// <param name="memory">Block allocated with AllocHGlobal; ownership transfers to this stream.</param>
            /// <param name="length">Number of readable bytes in the block.</param>
            public HGlobalReadStream(IntPtr memory, long length)
                : base((byte*)memory.ToPointer(), length, length, FileAccess.Read)
            {
                _memory = memory;
            }

            /// <summary>Closes the stream and frees the unmanaged block exactly once.</summary>
            protected override void Dispose(bool disposing)
            {
                try
                {
                    base.Dispose(disposing);
                }
                finally
                {
                    // Exchange makes a repeated or concurrent Dispose free the block only once.
                    IntPtr memory = Interlocked.Exchange(ref _memory, IntPtr.Zero);
                    if (memory != IntPtr.Zero)
                    {
                        Marshal.FreeHGlobal(memory);
                    }
                }
            }
        }
    }
}
Configuration
.NET 4.8 and 6.0
Regression?
No, new CPU behavior.
Data
WPR traces of real-world apps are available.
Analysis
This was found while tuning WPF applications on VDI and seeing a noticeable performance drop when opening new screens on these new processors.
Interlocked.Read is used by a lot of APIs and apps. If we can improve perf here, a lot of apps will magically run faster on modern hardware. Otherwise, this issue will have to be addressed on an individual basis.
Many large enterprise apps have 32-bit 3rd-party dependencies, so they cannot be rebuilt as 64-bit processes to avoid this perf issue.