Skip to content

Commit cd1ca40

Browse files
committed
[NVPTX] use sink symbol for single-element unpacking of v2f32s
1 parent 8e9ae7d commit cd1ca40

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

+15
Original file line number | Diff line number | Diff line change
@@ -3043,6 +3043,16 @@ let hasSideEffects = false in {
30433043
(ins Int64Regs:$s),
30443044
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
30453045
[]>;
3046+
def I64toF32HS : NVPTXInst<(outs Float32Regs:$high),
3047+
(ins Int64Regs:$s),
3048+
"mov.b64 {{_, $high}}, $s;",
3049+
[]>,
3050+
Requires<[hasPTX<71>]>;
3051+
def I64toF32LS : NVPTXInst<(outs Float32Regs:$low),
3052+
(ins Int64Regs:$s),
3053+
"mov.b64 {{$low, _}}, $s;",
3054+
[]>,
3055+
Requires<[hasPTX<71>]>;
30463056

30473057
// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
30483058
// unused high/low part.
@@ -3086,6 +3096,11 @@ foreach vt = [v2f16, v2bf16, v2i16] in {
30863096
def : Pat<(extractelt vt:$src, 1), (I32toI16H $src)>;
30873097
}
30883098

3099+
def : Pat<(extractelt v2f32:$src, 0),
3100+
(I64toF32LS $src)>, Requires<[hasPTX<71>]>;
3101+
def : Pat<(extractelt v2f32:$src, 1),
3102+
(I64toF32HS $src)>, Requires<[hasPTX<71>]>;
3103+
30893104
def : Pat<(extractelt v2f32:$src, 0),
30903105
(I64toF32L $src)>;
30913106
def : Pat<(extractelt v2f32:$src, 1),

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

+2 -2
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ define float @test_extract_0(<2 x float> %a) #0 {
3333
; CHECK-EMPTY:
3434
; CHECK-NEXT: // %bb.0:
3535
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
36-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
36+
; CHECK-NEXT: mov.b64 {%f1, _}, %rd1;
3737
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
3838
; CHECK-NEXT: ret;
3939
%e = extractelement <2 x float> %a, i32 0
@@ -48,7 +48,7 @@ define float @test_extract_1(<2 x float> %a) #0 {
4848
; CHECK-EMPTY:
4949
; CHECK-NEXT: // %bb.0:
5050
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
51-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
51+
; CHECK-NEXT: mov.b64 {_, %f1}, %rd1;
5252
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
5353
; CHECK-NEXT: ret;
5454
%e = extractelement <2 x float> %a, i32 1

0 commit comments

Comments
 (0)