Skip to content

Incorrect codegen for PermuteVar32x16x2 #90001

@MineCake147E

Description

@MineCake147E

Description

When the first parameter of PermuteVar32x16x2 sits in memory, RyuJIT appears to incorrectly assume that it can swap the lower and upper operands of PermuteVar32x16x2.

using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Runtime.CompilerServices;

// Minimal reproduction for the suspected RyuJIT codegen bug in Avx512BW.PermuteVar32x16x2.
// The statement shape is intentional; do not restructure it, or the bug may stop reproducing.
namespace Avx512CodegenBugMinimalReproduction
{
    /// <summary>
    /// Console program that runs the permute once and prints the expected and actual vectors.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Calls <c>Avx512BW.PermuteVar32x16x2</c> with <paramref name="left"/> as the first
        /// (lower) operand, a constant index vector 1..32 as the second operand, and a vector
        /// built from <paramref name="right"/> as the third (upper) operand.
        /// </summary>
        /// <param name="left">Vector passed unchanged as the first operand of the permute.</param>
        /// <param name="right">Scalar wrapped with <c>Vector512.CreateScalarUnsafe</c> and passed as the third operand.</param>
        /// <returns>The vector returned by <c>Avx512BW.PermuteVar32x16x2</c>.</returns>
        /// <remarks>
        /// <c>AggressiveOptimization</c> is part of the repro: the report states that without
        /// this attribute the method produces the expected output.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        public static Vector512<ushort> PermuteVar32x16x2Test(Vector512<ushort> left, ushort right)
        {
            // Locals are named after x86 registers, presumably the registers the author
            // expected each value to occupy in the generated code.
            var r8w = right;
            var zmm0 = left;
            // NOTE(review): CreateScalarUnsafe is documented to leave elements above index 0
            // undefined; confirm against the API docs. With the indices below, only element 0
            // of this vector should be selected (by index 32).
            var zmm1 = Vector512.CreateScalarUnsafe(r8w);
            // Index vector 1..32: for the inputs used in Main the report expects these to
            // yield 1..32 (elements 1..31 of zmm0 followed by element 0 of zmm1).
            var zmm2 = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
            return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
        }

        /// <summary>
        /// Prints whether AVX512-BW is supported, then prints the expected vector (1..32) and
        /// the vector actually returned by <see cref="PermuteVar32x16x2Test"/> for
        /// <c>left = 0..31</c> and <c>right = 32</c>. No comparison is performed; the two
        /// lines are meant to be compared by eye.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine($"AVX512-BW Supported: {Avx512BW.IsSupported}");
            var expected = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
            Vector512<ushort> actual = PermuteVar32x16x2Test(Vector512.Create(ushort.MinValue, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 32);
            Console.WriteLine($"Expected: {expected}");
            Console.WriteLine($"Actual: {actual}");
        }
    }
}

The codegen for PermuteVar32x16x2Test looks like this (taken from the VS2022 disassembly view):

00007FFAE7A26E50  vzeroupper  
            return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
00007FFAE7A26E53  vmovups     zmm0,zmmword ptr [Avx512CodegenBugMinimalReproduction.Program.PermuteVar32x16x2Test(System.Runtime.Intrinsics.Vector512`1<UInt16>, UInt16)+030h (07FFAE7A26E80h)]  
00007FFAE7A26E5D  movzx       eax,r8w  
00007FFAE7A26E61  vmovd       xmm1,eax  
00007FFAE7A26E65  vpermt2w    zmm1,zmm0,zmmword ptr [rdx]  ; zmmword ptr [rdx] must be loaded in advance! Why did RyuJIT swap that?
00007FFAE7A26E6B  vmovups     zmmword ptr [rcx],zmm1  
00007FFAE7A26E71  mov         rax,rcx  
00007FFAE7A26E74  vzeroupper  
00007FFAE7A26E77  ret

Reproduction Steps

using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Runtime.CompilerServices;

// Minimal reproduction for the suspected RyuJIT codegen bug in Avx512BW.PermuteVar32x16x2.
// The statement shape is intentional; do not restructure it, or the bug may stop reproducing.
namespace Avx512CodegenBugMinimalReproduction
{
    /// <summary>
    /// Console program that runs the permute once and prints the expected and actual vectors.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Calls <c>Avx512BW.PermuteVar32x16x2</c> with <paramref name="left"/> as the first
        /// (lower) operand, a constant index vector 1..32 as the second operand, and a vector
        /// built from <paramref name="right"/> as the third (upper) operand.
        /// </summary>
        /// <param name="left">Vector passed unchanged as the first operand of the permute.</param>
        /// <param name="right">Scalar wrapped with <c>Vector512.CreateScalarUnsafe</c> and passed as the third operand.</param>
        /// <returns>The vector returned by <c>Avx512BW.PermuteVar32x16x2</c>.</returns>
        /// <remarks>
        /// <c>AggressiveOptimization</c> is part of the repro: the report states that without
        /// this attribute the method produces the expected output.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        public static Vector512<ushort> PermuteVar32x16x2Test(Vector512<ushort> left, ushort right)
        {
            // Locals are named after x86 registers, presumably the registers the author
            // expected each value to occupy in the generated code.
            var r8w = right;
            var zmm0 = left;
            // NOTE(review): CreateScalarUnsafe is documented to leave elements above index 0
            // undefined; confirm against the API docs. With the indices below, only element 0
            // of this vector should be selected (by index 32).
            var zmm1 = Vector512.CreateScalarUnsafe(r8w);
            // Index vector 1..32: for the inputs used in Main the report expects these to
            // yield 1..32 (elements 1..31 of zmm0 followed by element 0 of zmm1).
            var zmm2 = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
            return Avx512BW.PermuteVar32x16x2(zmm0, zmm2, zmm1);
        }

        /// <summary>
        /// Prints whether AVX512-BW is supported, then prints the expected vector (1..32) and
        /// the vector actually returned by <see cref="PermuteVar32x16x2Test"/> for
        /// <c>left = 0..31</c> and <c>right = 32</c>. No comparison is performed; the two
        /// lines are meant to be compared by eye.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine($"AVX512-BW Supported: {Avx512BW.IsSupported}");
            var expected = Vector512.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
            Vector512<ushort> actual = PermuteVar32x16x2Test(Vector512.Create(ushort.MinValue, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 32);
            Console.WriteLine($"Expected: {expected}");
            Console.WriteLine($"Actual: {actual}");
        }
    }
}

Expected behavior

AVX512-BW Supported: True
Expected: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>
Actual: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>

FYI, without [MethodImpl(MethodImplOptions.AggressiveOptimization)], it generates the expected output.

Actual behavior

AVX512-BW Supported: True
Expected: <1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32>
Actual: <0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4>

Even if lower and upper had simply been swapped, Actual should be all zeros, because vmovd clears all of the remaining upper bits of the register... It's weird.

Regression?

Unknown

Known Workarounds

None

Configuration

  • PC: Custom built PC
    • OS: Windows 11 Home 22H2 (22621.1992)
    • CPU: Intel(R) Xeon(R) w5-2455X 3.19 GHz
    • RAM: 32 GB DDR5 ECC RDIMM
    • Graphics: NVIDIA GeForce GTX1060(6 GB)
    • Storage:
      • OS(C:) : WD_BLACK SN770 1TB

Other information

No response

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions