-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Closed
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
Description
When the first argument of PermuteVar32x16x2 resides in memory, RyuJIT appears to incorrectly assume that it can swap the `lower` and `upper` operands of PermuteVar32x16x2.
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Runtime.CompilerServices;
namespace Avx512CodegenBugMinimalReproduction
{
/// <summary>
/// Minimal reproduction program for the PermuteVar32x16x2 codegen issue:
/// runs one permute and prints the expected and actual vectors side by side.
/// </summary>
internal class Program
{
/// <summary>
/// Calls <c>Avx512BW.PermuteVar32x16x2</c> with <paramref name="left"/> as the first operand,
/// the constant index vector 1..32 as the second, and a vector built from
/// <paramref name="right"/> as the third.
/// </summary>
/// <param name="left">Vector passed as the first operand of the permute.</param>
/// <param name="right">Scalar placed in a vector via <c>CreateScalarUnsafe</c> and passed as the third operand.</param>
/// <returns>The result of the permute.</returns>
// AggressiveOptimization is essential to the repro: per this report, the method
// produces the expected output when the attribute is removed.
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static Vector512<ushort> PermuteVar32x16x2Test(Vector512<ushort> left, ushort right)
{
// NOTE(review): locals appear to be named after the x64 registers the reporter
// expected the values to occupy (compare with the disassembly in the report).
var r8w = right;
var zmm0 = left;
var zmm1 = Vector512.CreateScalarUnsafe(r8w);
// Index vector: the 32 consecutive values 1..32.
var zmm2 = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
// Presumably indices 0..31 select from the first operand and 32..63 from the third,
// so only element 0 of zmm1 would be read here — TODO confirm against the API docs.
return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
}
/// <summary>
/// Prints whether AVX512-BW is supported, then the expected vector (1..32) and the
/// vector actually returned by <c>PermuteVar32x16x2Test</c> for left = 0..31, right = 32.
/// </summary>
static void Main(string[] args)
{
Console.WriteLine($"AVX512-BW Supported: {Avx512BW.IsSupported}");
// Expected result: the values 1..32 in order.
var expected = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
// Input: left holds 0..31 (ushort.MinValue is 0) and right is 32.
Vector512<ushort> actual = PermuteVar32x16x2Test(Vector512.Create(ushort.MinValue, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 32);
Console.WriteLine($"Expected: {expected}");
Console.WriteLine($"Actual: {actual}");
}
}
}
The codegen for PermuteVar32x16x2Test looks like this (from the VS2022 disassembly view):
00007FFAE7A26E50 vzeroupper
return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
00007FFAE7A26E53 vmovups zmm0,zmmword ptr [Avx512CodegenBugMinimalReproduction.Program.PermuteVar32x16x2Test(System.Runtime.Intrinsics.Vector512`1<UInt16>, UInt16)+030h (07FFAE7A26E80h)]
00007FFAE7A26E5D movzx eax,r8w
00007FFAE7A26E61 vmovd xmm1,eax
00007FFAE7A26E65 vpermt2w zmm1,zmm0,zmmword ptr [rdx] ; zmmword ptr [rdx] must be loaded in advance! Why did RyuJIT swap the operands?
00007FFAE7A26E6B vmovups zmmword ptr [rcx],zmm1
00007FFAE7A26E71 mov rax,rcx
00007FFAE7A26E74 vzeroupper
00007FFAE7A26E77 ret
Reproduction Steps
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Runtime.CompilerServices;
namespace Avx512CodegenBugMinimalReproduction
{
/// <summary>
/// Minimal reproduction program for the PermuteVar32x16x2 codegen issue:
/// runs one permute and prints the expected and actual vectors side by side.
/// </summary>
internal class Program
{
/// <summary>
/// Calls <c>Avx512BW.PermuteVar32x16x2</c> with <paramref name="left"/> as the first operand,
/// the constant index vector 1..32 as the second, and a vector built from
/// <paramref name="right"/> as the third.
/// </summary>
/// <param name="left">Vector passed as the first operand of the permute.</param>
/// <param name="right">Scalar placed in a vector via <c>CreateScalarUnsafe</c> and passed as the third operand.</param>
/// <returns>The result of the permute.</returns>
// AggressiveOptimization is essential to the repro: per this report, the method
// produces the expected output when the attribute is removed.
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static Vector512<ushort> PermuteVar32x16x2Test(Vector512<ushort> left, ushort right)
{
// NOTE(review): locals appear to be named after the x64 registers the reporter
// expected the values to occupy (compare with the disassembly in the report).
var r8w = right;
var zmm0 = left;
var zmm1 = Vector512.CreateScalarUnsafe(r8w);
// Index vector: the 32 consecutive values 1..32.
var zmm2 = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
// Presumably indices 0..31 select from the first operand and 32..63 from the third,
// so only element 0 of zmm1 would be read here — TODO confirm against the API docs.
return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
}
/// <summary>
/// Prints whether AVX512-BW is supported, then the expected vector (1..32) and the
/// vector actually returned by <c>PermuteVar32x16x2Test</c> for left = 0..31, right = 32.
/// </summary>
static void Main(string[] args)
{
Console.WriteLine($"AVX512-BW Supported: {Avx512BW.IsSupported}");
// Expected result: the values 1..32 in order.
var expected = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
// Input: left holds 0..31 (ushort.MinValue is 0) and right is 32.
Vector512<ushort> actual = PermuteVar32x16x2Test(Vector512.Create(ushort.MinValue, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 32);
Console.WriteLine($"Expected: {expected}");
Console.WriteLine($"Actual: {actual}");
}
}
}
Expected behavior
AVX512-BW Supported: True
Expected: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>
Actual: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>
FYI, without [MethodImpl(MethodImplOptions.AggressiveOptimization)], it generates the expected output.
Actual behavior
AVX512-BW Supported: True
Expected: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>
Actual: <0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4>
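Even if the lower and upper operands had been swapped, the actual result should be all zeros, since vmovd clears all the remaining upper bits of its destination register. Instead, non-zero values (32 at index 3 and 4 at index 31) appear in the output, which is puzzling.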
Regression?
Unknown
Known Workarounds
None
Configuration
- PC: Custom built PC
- OS: Windows 11 Home 22H2 (22621.1992)
- CPU: Intel(R) Xeon(R) w5-2455X 3.19 GHz
- RAM: 32 GB DDR5 ECC RDIMM
- Graphics: NVIDIA GeForce GTX1060(6 GB)
- Storage:
- OS(C:) : WD_BLACK SN770 1TB
Other information
No response
JulieLeeMSFT
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI