Skip to content

Interlocked.Read from 32-bit apps are much slower on Intel Sapphire Rapids CPUs #93624

Closed
@TrevorFellman-MSFT

Description

@TrevorFellman-MSFT

Description

Interlocked.Read calls made from 32-bit apps are much slower (about 100x) on Intel Sapphire Rapids CPUs.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace UnmanagedMemoryStreamPerfTest
{
    /// <summary>
    /// Repro harness: many concurrent workers repeatedly read <see cref="Stream.Length"/> from an
    /// <see cref="UnmanagedMemoryStream"/> and the total elapsed time is printed in milliseconds.
    /// </summary>
    class Program
    {
        /// <summary>Number of concurrent workers started by <see cref="MainAsync"/>.</summary>
        const int WorkerCount = 100;

        /// <summary>Number of reads each worker performs in <see cref="DoStuff"/>.</summary>
        const int ReadsPerWorker = 1000000;

        /// <summary>
        /// True when the process runs with 4-byte pointers. Initialised from the real pointer size
        /// so the value is correct even before <see cref="Main"/> runs.
        /// </summary>
        static readonly bool _Is32BitProcess = GetIs32BitProcess();

        /// <summary>
        /// Every unmanaged buffer handed out by <see cref="GetStream"/>. UnmanagedMemoryStream does
        /// not own its memory, so the buffers are released explicitly by <see cref="FreeUnmanagedBlocks"/>.
        /// The list doubles as its own lock object.
        /// </summary>
        static readonly List<IntPtr> _UnmanagedBlocks = new List<IntPtr>();

        /// <summary>
        /// Entry point: prints the process bitness, runs the workload, prints the elapsed
        /// milliseconds and waits for Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Is 32 bit process: " + _Is32BitProcess);

            try
            {
                // One stream is created (and closed) up front so that the first GetStream call
                // happens outside the timed region.
                using (Stream warmUp = GetStream())
                {
                }

                Stopwatch sw = Stopwatch.StartNew();

                // Block until every worker has finished; otherwise the stopwatch would only
                // measure how long it takes to *start* the work. Blocking is fine here because a
                // console Main has no synchronization context to deadlock on.
                MainAsync().GetAwaiter().GetResult();

                sw.Stop();
                Console.WriteLine(sw.ElapsedMilliseconds);
            }
            finally
            {
                FreeUnmanagedBlocks();
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Starts <see cref="WorkerCount"/> workers, each with its own stream, and completes when
        /// all of them have finished.
        /// </summary>
        /// <returns>A task that completes (or faults) when every worker has completed.</returns>
        static async Task MainAsync()
        {
            // 100 concurrent workers to make it even more obvious
            List<Stream> streams = new List<Stream>(WorkerCount);
            try
            {
                List<Task> tasks = new List<Task>(WorkerCount);
                for (int i = 0; i < WorkerCount; i++)
                {
                    Stream created = GetStream();
                    streams.Add(created);
                    tasks.Add(DoStuff(created));
                }

                await Task.WhenAll(tasks);
            }
            finally
            {
                foreach (Stream owned in streams)
                {
                    owned.Dispose();
                }
            }
        }

        /// <summary>
        /// Runs the hot loop for one worker on a thread-pool thread.
        /// </summary>
        /// <param name="stream">The stream whose length is read <see cref="ReadsPerWorker"/> times.</param>
        /// <returns>A task representing the running loop.</returns>
        static Task DoStuff(Stream stream)
        {
            // Contrived, but this is just so we can see it.
            // For every XAML file, WPF will read BAML streams from the assembly.
            // For apps with dense UIs, that can be thousands of streams to open a screen.

            // The problem (as reported): 32-bit will be about 3-20 times slower, and 200 times slower
            // on high-end 2023 AMD and Intel chips due to a throttling feature for the type of lock
            // UnmanagedMemoryStream takes.

            // Task.Run is required: an async method without an await runs to completion on the
            // caller's thread, which would serialise all workers.
            return Task.Run(() =>
            {
                for (int i = 0; i < ReadsPerWorker; i++)
                {
                    // Swap in DoStuffInterlocked() or DoStuffInterlockedTreF() to compare workloads.
                    DoStuffStream(stream);
                }
            });
        }

        /// <summary>Workload under test: reads the stream's length.</summary>
        /// <param name="stream">The stream to query.</param>
        /// <returns>The value of <see cref="Stream.Length"/>.</returns>
        static long DoStuffStream(Stream stream)
        {
            return stream.Length;
        }

        /// <summary>Baseline workload: a 64-bit atomic read of a local.</summary>
        /// <returns>The atomically read value (always 0 here).</returns>
        static long DoStuffInterlocked()
        {
            long len = 0;

            // Return what the atomic read produced instead of re-reading the variable non-atomically.
            return Interlocked.Read(ref len);
        }

        /// <summary>
        /// Experimental workload: in a 32-bit process, replaces the 64-bit atomic read with a
        /// 32-bit interlocked add of zero on the first four bytes of the value.
        /// </summary>
        /// <returns>
        /// In a 32-bit process, the first 32 bits of the value widened to <see cref="long"/>;
        /// otherwise the full 64-bit value.
        /// </returns>
        static unsafe long DoStuffInterlockedTreF()
        {
            // Inspiration from the Parallel.For() fix:
            // https://devdiv.visualstudio.com/DevDiv/_search?text=969699&type=workitem&pageSize=25&filters=Projects%7BDevDiv%7D
            // Read uses Add under the hood, so this should be ok.

            // Gotcha: 32-bit apps could then only use 2GB streams, limited by the 32-bit int.
            // Maybe add another flag, _BigStream.

            long len = 0;
            if (_Is32BitProcess  /*&& !_BigStream*/)
            {
                long* indexPtr = &len;

                // Adding zero leaves the value untouched and returns it.
                return Interlocked.Add(ref *(int*)indexPtr, 0);
            }

            return Interlocked.Read(ref len);
        }

        /// <summary>
        /// Creates a read-only <see cref="UnmanagedMemoryStream"/> over a freshly allocated unmanaged
        /// buffer containing the UTF-16 bytes of <see cref="GetMessage"/>.
        /// </summary>
        /// <remarks>
        /// Based on the sample at
        /// https://learn.microsoft.com/en-us/dotnet/api/system.io.unmanagedmemorystream?view=net-7.0
        /// (overkill for this test, but it is the documented usage). The buffer is recorded in
        /// <see cref="_UnmanagedBlocks"/> and stays valid until <see cref="FreeUnmanagedBlocks"/> runs.
        /// </remarks>
        /// <returns>An open stream positioned at its end, because the content has been read once.</returns>
        static unsafe Stream GetStream()
        {
            byte[] message = Encoding.Unicode.GetBytes(GetMessage());

            // Allocate a block of unmanaged memory and remember it straight away, so it is
            // released even if one of the following steps throws.
            IntPtr memIntPtr = Marshal.AllocHGlobal(message.Length);
            lock (_UnmanagedBlocks)
            {
                _UnmanagedBlocks.Add(memIntPtr);
            }

            // Get a byte pointer from the IntPtr object.
            byte* memBytePtr = (byte*)memIntPtr.ToPointer();

            // Write the data through a short-lived write-only stream.
            using (UnmanagedMemoryStream writeStream = new UnmanagedMemoryStream(memBytePtr, message.Length, message.Length, FileAccess.Write))
            {
                writeStream.Write(message, 0, message.Length);
            }

            // Create another UnmanagedMemoryStream over the same memory, this time read-only.
            UnmanagedMemoryStream readStream = new UnmanagedMemoryStream(memBytePtr, message.Length, message.Length, FileAccess.Read);

            // Read from unmanaged memory into a scratch array (kept from the sample code).
            byte[] outMessage = new byte[message.Length];
            readStream.Read(outMessage, 0, message.Length);

            // The stream is deliberately left open; the caller disposes it.
            return readStream;
        }

        /// <summary>
        /// Releases every unmanaged buffer allocated by <see cref="GetStream"/>. Call only once no
        /// stream returned by <see cref="GetStream"/> will be read again.
        /// </summary>
        static void FreeUnmanagedBlocks()
        {
            lock (_UnmanagedBlocks)
            {
                foreach (IntPtr block in _UnmanagedBlocks)
                {
                    Marshal.FreeHGlobal(block);
                }

                _UnmanagedBlocks.Clear();
            }
        }

        /// <summary>Returns the text stored in each test stream.</summary>
        static string GetMessage()
        {
            return "Here is some data.";
        }

        /// <summary>Returns true when pointers are 4 bytes wide, i.e. the process is 32-bit.</summary>
        static bool GetIs32BitProcess()
        {
            return IntPtr.Size == 4;
        }
    }
}

Configuration

.NET 4.8 and 6.0

Regression?

No, new CPU behavior.

Data

WPR (Windows Performance Recorder) traces of real-world apps are available.

Analysis

This was found while tuning WPF applications on VDI, where we saw a noticeable performance drop when opening new screens on these new processors.
Interlocked.Read is used by a lot of APIs and apps. If we can improve perf here, a lot of apps will magically run faster on modern hardware. Otherwise, this issue will have to be addressed on an individual basis.

Many large enterprise apps have 32-bit third-party dependencies, so they cannot be rebuilt as 64-bit processes to avoid this perf issue.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions