Ensure Vector<T>.op_Multiply is handled as an intrinsic in appropriate cases #49503

tannergooding · 2021-03-11T20:39:39Z

This resolves #49071 and resolves #30923 by ensuring Vector<T> * T is treated as an intrinsic so it generates the ideal codegen and not a for loop:

BenchmarkDotNet=v0.12.1.1466-nightly, OS=Windows 10.0.19042
AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
.NET SDK=6.0.100-preview.3.21161.17
  [Host]     : .NET 6.0.0 (6.0.21.15916), X64 RyuJIT
  Job-KMJTTY : .NET 6.0.0 (42.42.42.42424), X64 RyuJIT

PowerPlanMode=00000000-0000-0000-0000-000000000000  Arguments=/p:DebugType=portable  Toolchain=CoreRun
IterationTime=250.0000 ms  MaxIterationCount=20  MinIterationCount=15
WarmupCount=1

.NET 5

Method	Mean	Error	StdDev	Median	Min	Max	Gen 0	Gen 1	Gen 2	Allocated
Burgers_3	66.19 ms	0.324 ms	0.287 ms	66.16 ms	65.52 ms	66.53 ms	-	-	-	156 KB

Disassembly .NET 5.0.4 (5.0.421.11614), X64 RyuJIT

; Burgers.GetCalculated3(Int32, Int32, Double, Double, Double, Double[])
       push      r15
       push      r14
       push      r12
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,50
       vzeroupper
       vmovaps   [rsp+40],xmm6
       vmovaps   [rsp+30],xmm7
       vmovaps   [rsp+20],xmm8
       mov       edi,ecx
       mov       esi,edx
       vmovaps   xmm6,xmm2
       vmovaps   xmm7,xmm3
       mov       rbx,[rsp+0B8]
       mov       edx,esi
       sar       edx,1F
       and       edx,3
       add       edx,esi
       and       edx,0FFFFFFFC
       mov       ecx,esi
       sub       ecx,edx
       mov       edx,ecx
       neg       edx
       lea       ebp,[rdx+rsi+4]
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r14,rax
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r15,rax
       mov       r8d,[rbx+8]
       mov       rcx,rbx
       mov       rdx,r15
       call      System.Array.Copy(System.Array, System.Array, Int32)
       vmulsd    xmm0,xmm7,qword ptr [rsp+0B0]
       vdivsd    xmm0,xmm0,xmm6
       vmovsd    xmm1,qword ptr [7FF99D1F1DE8]
       call      System.Math.Pow(Double, Double)
       xor       edx,edx
       test      edi,edi
       jle       near ptr M01_L03
       add       ebp,0FFFFFFFD
       lea       eax,[rsi+0FFFF]
       movsxd    rcx,eax
       add       esi,0FFFFFFFE
       movsxd    r8,esi
M01_L00:
       mov       r9d,1
       cmp       ebp,1
       jle       near ptr M01_L02
       mov       r10d,[r15+8]
       vdivsd    xmm1,xmm7,xmm6
M01_L01:
       cmp       r9d,r10d
       jae       near ptr M01_L05
       lea       ebx,[r9+3]
       cmp       ebx,r10d
       jae       near ptr M01_L05
       vmovupd   ymm2,[r15+r9*8+10]
       lea       r11d,[r9+0FFFF]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+2]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm3,[r15+r11*8+10]
       lea       r11d,[r9+1]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+4]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm4,[r15+r11*8+10]
       vbroadcastsd ymm5,xmm1
       vmulpd    ymm5,ymm5,ymm2
       vsubpd    ymm8,ymm2,ymm3
       vmulpd    ymm5,ymm5,ymm8
       vsubpd    ymm5,ymm2,ymm5
       vbroadcastsd ymm8,qword ptr [7FF99D1F1DF8]
       vmulpd    ymm2,ymm8,ymm2
       vsubpd    ymm2,ymm4,ymm2
       vaddpd    ymm2,ymm2,ymm3
       vbroadcastsd ymm3,xmm0
       vmulpd    ymm2,ymm3,ymm2
       vaddpd    ymm2,ymm5,ymm2
       mov       r11d,[r14+8]
       cmp       r9d,r11d
       jae       near ptr M01_L06
       cmp       ebx,r11d
       jae       near ptr M01_L07
       vmovupd   [r14+r9*8+10],ymm2
       mov       r9d,r12d
       cmp       r9d,ebp
       jl        near ptr M01_L01
M01_L02:
       mov       r10d,[r15+8]
       cmp       r10d,0
       jbe       near ptr M01_L05
       vmovsd    xmm1,qword ptr [r15+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       eax,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+rcx*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       cmp       r10d,1
       jbe       near ptr M01_L05
       vmovsd    xmm4,qword ptr [r15+18]
       vmulsd    xmm1,xmm1,qword ptr [7FF99D1F1E08]
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       mov       r11d,[r14+8]
       cmp       r11d,0
       jbe       near ptr M01_L05
       vmovsd    qword ptr [r14+10],xmm1
       vmovsd    xmm1,qword ptr [r15+rcx*8+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       esi,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+r8*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       vaddsd    xmm1,xmm1,xmm1
       vmovsd    xmm4,qword ptr [r15+10]
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       cmp       eax,r11d
       jae       short M01_L05
       vmovsd    qword ptr [r14+rcx*8+10],xmm1
       inc       edx
       cmp       edx,edi
       jl        short M01_L04
       mov       r15,r14
M01_L03:
       mov       rax,r15
       vmovaps   xmm6,[rsp+40]
       vmovaps   xmm7,[rsp+30]
       vmovaps   xmm8,[rsp+20]
       vzeroupper
       add       rsp,50
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r14
       pop       r15
       ret
M01_L04:
       xchg      r14,r15
       jmp       near ptr M01_L00
M01_L05:
       call      CORINFO_HELP_RNGCHKFAIL
M01_L06:
       call      CORINFO_HELP_THROW_ARGUMENTOUTOFRANGEEXCEPTION
M01_L07:
       call      CORINFO_HELP_THROW_ARGUMENTEXCEPTION
       int       3
; Total bytes of code 672

.NET 6

Method	Mean	Error	StdDev	Median	Min	Max	Gen 0	Gen 1	Gen 2	Allocated
Burgers_3	76.40 ms	0.266 ms	0.249 ms	76.45 ms	75.93 ms	76.72 ms	-	-	-	158 KB

Disassembly .NET 6.0.0 (42.42.42.42424), X64 RyuJIT

; Burgers.GetCalculated3(Int32, Int32, Double, Double, Double, Double[])
       push      r15
       push      r14
       push      r12
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,60
       vzeroupper
       vmovaps   [rsp+50],xmm6
       vmovaps   [rsp+40],xmm7
       vmovaps   [rsp+30],xmm8
       vmovaps   [rsp+20],xmm9
       mov       edi,ecx
       mov       esi,edx
       vmovaps   xmm6,xmm2
       vmovaps   xmm7,xmm3
       mov       rbx,[rsp+0C8]
       mov       edx,esi
       sar       edx,1F
       and       edx,3
       add       edx,esi
       and       edx,0FFFFFFFC
       mov       ecx,esi
       sub       ecx,edx
       mov       edx,ecx
       neg       edx
       lea       ebp,[rdx+rsi+4]
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r14,rax
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r15,rax
       mov       r8d,[rbx+8]
       mov       rcx,rbx
       mov       rdx,r15
       call      System.Array.Copy(System.Array, System.Array, Int32)
       vmulsd    xmm0,xmm7,qword ptr [rsp+0C0]
       vdivsd    xmm0,xmm0,xmm6
       vmovsd    xmm8,qword ptr [7FF99709A800]
       vmovaps   xmm1,xmm8
       call      System.Math.Pow(Double, Double)
       xor       edx,edx
       test      edi,edi
       jle       near ptr M01_L03
       add       ebp,0FFFFFFFD
       lea       eax,[rsi+0FFFF]
       movsxd    rcx,eax
       add       esi,0FFFFFFFE
       movsxd    r8,esi
M01_L00:
       mov       r9d,1
       cmp       ebp,1
       jle       near ptr M01_L02
       mov       r10d,[r15+8]
       vdivsd    xmm1,xmm7,xmm6
M01_L01:
       cmp       r9d,r10d
       jae       near ptr M01_L05
       lea       ebx,[r9+3]
       cmp       ebx,r10d
       jae       near ptr M01_L05
       vmovupd   ymm2,[r15+r9*8+10]
       lea       r11d,[r9+0FFFF]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+2]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm3,[r15+r11*8+10]
       lea       r11d,[r9+1]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+4]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm4,[r15+r11*8+10]
       vmovaps   xmm5,xmm1
       vbroadcastsd ymm5,xmm5
       vmulpd    ymm5,ymm2,ymm5
       vsubpd    ymm9,ymm2,ymm3
       vmulpd    ymm5,ymm5,ymm9
       vsubpd    ymm5,ymm2,ymm5
       vmovaps   xmm9,xmm8
       vbroadcastsd ymm9,xmm9
       vmulpd    ymm2,ymm9,ymm2
       vsubpd    ymm2,ymm4,ymm2
       vaddpd    ymm2,ymm2,ymm3
       vmovaps   xmm3,xmm0
       vbroadcastsd ymm3,xmm3
       vmulpd    ymm2,ymm3,ymm2
       vaddpd    ymm2,ymm5,ymm2
       mov       r11d,[r14+8]
       cmp       r9d,r11d
       jae       near ptr M01_L06
       cmp       ebx,r11d
       jae       near ptr M01_L07
       vmovupd   [r14+r9*8+10],ymm2
       mov       r9d,r12d
       cmp       r9d,ebp
       jl        near ptr M01_L01
M01_L02:
       mov       r10d,[r15+8]
       test      r10d,r10d
       je        near ptr M01_L05
       vmovsd    xmm1,qword ptr [r15+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       eax,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+rcx*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       cmp       r10d,1
       jbe       near ptr M01_L05
       vmovsd    xmm4,qword ptr [r15+18]
       vmulsd    xmm1,xmm1,xmm8
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       mov       r11d,[r14+8]
       test      r11d,r11d
       je        near ptr M01_L05
       vmovsd    qword ptr [r14+10],xmm1
       vmovsd    xmm1,qword ptr [r15+rcx*8+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       esi,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+r8*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       vmovsd    xmm4,qword ptr [r15+10]
       vmulsd    xmm1,xmm1,xmm8
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       cmp       eax,r11d
       jae       short M01_L05
       vmovsd    qword ptr [r14+rcx*8+10],xmm1
       inc       edx
       cmp       edx,edi
       jl        short M01_L04
       mov       r15,r14
M01_L03:
       mov       rax,r15
       vmovaps   xmm6,[rsp+50]
       vmovaps   xmm7,[rsp+40]
       vmovaps   xmm8,[rsp+30]
       vmovaps   xmm9,[rsp+20]
       vzeroupper
       add       rsp,60
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r14
       pop       r15
       ret
M01_L04:
       xchg      r14,r15
       jmp       near ptr M01_L00
M01_L05:
       call      CORINFO_HELP_RNGCHKFAIL
M01_L06:
       call      CORINFO_HELP_THROW_ARGUMENTOUTOFRANGEEXCEPTION
M01_L07:
       call      CORINFO_HELP_THROW_ARGUMENTEXCEPTION
       int       3
; Total bytes of code 694

Notably, it is still slightly slower. This appears to be largely due to the additional vmovaps instructions inserted before the broadcast instructions:

       vmovaps   xmm9,xmm8
       vbroadcastsd ymm9,xmm9

This appears to be because we insert Avx2.BroadcastScalarToVector256(Vector128.CreateScalarUnsafe(value)) during lowering. The register allocator then assigns a different register for CreateScalarUnsafe even though it's marked as tgtPrefUse (this appears to be related to this happening inside a loop).

I'm investigating further to see if there is a trivial fix here (other than expanding Vector256_Create earlier, such as in the importer).

…e cases

tannergooding · 2021-03-11T20:45:20Z

The following demonstrates the "issue" with Vector256.Create(T value) (the same also exists for Vector64/128):
https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABABgAJiBGAOgCUBXAOwwEt8YaBJFqVp3KzC4A3AFgAUGUq1GLdpx4Y+AobhoANABxJxEycQDMlAEzkAwuQDek8ncrGAZgBsI2DOQCyAChduPjgCU1rb2YQBu2FDksLgMzh4AvOQAajBgGNAmAKxINOaw7jC+gXphYcQA7DEwcQk0ACoQAMpg2M5R3qWhdgC+kj0O5H7uXr6uo45o5PwerBgwULjBNhLl9pHRsfFJqemZUDlIADwjGAB8NABaixBl6+SDYY7Q5N6bM+TJpCKfxzMLJa/VgAahBgSe9lWD3KHww+AADl89hksrl8oUFiV7jC7NsEsiAILhBA0QkAE3J3nxGGm8IR3TWD36TPWkLsVRqdQwjRabQ6UC6OPILLCgyMwwmHgAcuN/MMVuzyB8aUSSSYaAAhKBucltXAYVrtKJNNKow65bxmg5UExaDEwIpGgUAVQE2EcxSCjJhnJpvOdnR99hZ4qcUvIsrOw2mswBi2WIVZsKiXJ2yOtaJOZ0uNx1wrFyeer3eqdYyJ+f3jQJmYIhRahSoiqfpaoQGu1uv1hv5JogmYtSCt+2gtvtBUdC0DUDduA9XsCwdxaYJyWJpIpVJpdMRS/KooeSr9tR2Ad7gr3LN6QA=

tannergooding · 2021-03-11T20:45:33Z

CC. @echesakovMSFT

…n ARM64

tannergooding · 2021-03-12T20:51:40Z

CC. @dotnet/jit-contrib

echesakov · 2021-03-12T21:02:04Z

src/coreclr/jit/simdashwintrinsic.cpp

+                    GenTree**      broadcastOp = nullptr;
+
+                    if (varTypeIsArithmetic(op1->TypeGet()))
+                    {


For Arm64 you don't need a broadcast operation when either op1 or op2 is floating-point.
Since the operands will be in SIMD registers already you should use MultiplyByElement (where elementIndex is 0) instead

I also don't think you need to broadcast an integer value to the whole SIMD register.
You can use ins Vd.T[0], Xd followed by mul Vd, Vn, Vd.T[0] instead.

In this case, you can share some of the implementation for floating-point and integer types.

We should only need a CreateScalarUnsafe in this case, then. I'll update to do the right thing.

Notably this wasn't possible before the ARM HWIntrinsics were brought online, which is likely why Vector wasn't doing that already.

Notably, MultiplyByScalar and MultiplyBySelectedScalar don't work for byte/sbyte and so those still need a broadcast.

echesakov · 2021-03-12T21:07:01Z

src/coreclr/jit/simdashwintrinsic.cpp

+                        case TYP_INT:
+                        case TYP_UINT:
+                        {
+                            hwIntrinsic = NI_AVX2_MultiplyLow;


Shouldn't this be guarded with compOpportunisticallyDependsOn(InstructionSet_AVX2)?

No, it's already asserted at the top of the method (https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/simdashwintrinsic.cpp#L392) as well as properly guarded further up in the stack.

We should never encounter SimdAsHWIntrinsicClassId::VectorT256 if AVX2 is not supported.

Why is at least AVX2 required for VectorT256? Shouldn't AVX be sufficient?

Vector<T> supports all 10 primitive types and can only properly accelerate the integer types with AVX2 so it was decided it is only 32-bytes if AVX2 is supported and is 16-bytes otherwise, for the best overall perf/experience.

src/libraries/System.Private.CoreLib/src/System/Numerics/Vector_1.cs

tannergooding · 2021-03-24T18:17:52Z

@echesakovMSFT, any other feedback here or is it good to merge?

echesakov

Looks Good.
Thank you

Ensure Vector<T>.op_Multiply is handled as an intrinsic in appropriat…

fca9b85

…e cases

ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Mar 11, 2021

This was referenced Mar 11, 2021

[Perf] Burgers.Test3 Regression on x64 #49071

Closed

Improve Vector<uint> and Vector<ushort> multiply #30923

Closed

tannergooding added 3 commits March 11, 2021 17:45

Applying formatting patch

24373c5

Ensure TYP_BYTE and TYP_UBYTE are handled for Vector<T>.op_Multiply o…

9d7c8c5

…n ARM64

Ensure broadcast nodes are inserted for all operator *(Vector<T>, T)

2b5a43e

tannergooding requested a review from echesakov March 12, 2021 20:51

echesakov suggested changes Mar 12, 2021

View reviewed changes

tannergooding added 3 commits March 12, 2021 13:51

Ensure ARM64 uses MultiplyByScalar when its available

b9c77d7

Applying formatting patch

1b1715d

Ensure the scalar for op_Multiply is op2 on ARM64

a79a1cb

tannergooding force-pushed the fix-49071 branch from 47e1038 to a79a1cb Compare March 15, 2021 18:00

Ensure we do a full multiply for Vector<T> * Vector<T> on ARM64

56c210d

runfoapp bot mentioned this pull request Mar 16, 2021

Mono System.Text.Json.Tests on Windows timing out #42677

Open

Applying formatting patch

2d72fd0

runfoapp bot mentioned this pull request Mar 22, 2021

Test failure Wasm.Build.Tests.WasmBuildAppTest.InvariantGlobalization #49494

Closed

echesakov approved these changes Mar 25, 2021

View reviewed changes

tannergooding merged commit 88c0c48 into dotnet:main Mar 25, 2021

ghost locked as resolved and limited conversation to collaborators Apr 24, 2021

karelz added this to the 6.0.0 milestone May 20, 2021

tannergooding deleted the fix-49071 branch November 11, 2022 15:26

Ensure Vector<T>.op_Multiply is handled as an intrinsic in appropriate cases #49503

Ensure Vector<T>.op_Multiply is handled as an intrinsic in appropriate cases #49503

Uh oh!

Conversation

tannergooding commented Mar 11, 2021 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

.NET 5

.NET 6

Uh oh!

tannergooding commented Mar 11, 2021

Uh oh!

tannergooding commented Mar 11, 2021 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

tannergooding commented Mar 12, 2021

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

tannergooding commented Mar 24, 2021

Uh oh!

echesakov left a comment

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

tannergooding commented Mar 11, 2021 •

edited

Loading

tannergooding commented Mar 11, 2021 •

edited

Loading