-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Remove redundant s_cmp_lg_* sX, 0 #162352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c5a15e5
250302a
613a06d
68cbb6e
b4628ba
a2e6ad4
7b84872
88fdd67
13d73b2
2682020
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B | |
| ; CHECK-LABEL: s_add64_32: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_add_u32 s0, s0, s2 | ||
| ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 | ||
|
Comment on lines -183 to -184
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a nice improvement. But I really wish we could generate better code in the first place, instead of generating horrible code and cleaning it up later.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Our lowering for carryout puts it into a virtual 32/64-bit SGPR. SCC is reconstructed from the virtual 32/64-bit SGPR as needed by users of the carryout. If we lowered directly into SCC, we could have problems with other definitions of SCC clobbering the carryout. We could (and did) somewhat avoid this problem by lowering 64-bit adds as a unit: the internal carryout for the low 32-bit part could be generated cleanly, but the carryout for the high 32 bits was a problem. Note that issue #152992 asked for good code for 64-bit carryout. |
||
| ; CHECK-NEXT: s_addc_u32 s1, s1, s3 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 | ||
| ; CHECK-NEXT: s_addc_u32 s2, s4, 0 | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %sum64 = add i64 %val64A, %val64B | ||
|
|
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B | |
| define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { | ||
| ; CHECK-LABEL: s_uadd_v2i64: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_add_u32 s10, s2, s6 | ||
| ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 | ||
| ; CHECK-NEXT: s_addc_u32 s8, s3, s7 | ||
| ; CHECK-NEXT: s_add_u32 s6, s2, s6 | ||
| ; CHECK-NEXT: s_addc_u32 s7, s3, s7 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_add_u32 s0, s0, s4 | ||
| ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 | ||
| ; CHECK-NEXT: s_addc_u32 s1, s1, s5 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg | |
| ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v4, s10 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, s8 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v4, s6 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, s7 | ||
| ; CHECK-NEXT: s_mov_b32 s1, s0 | ||
| ; CHECK-NEXT: s_mov_b32 s3, s2 | ||
| ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] | ||
|
|
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg | |
| define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { | ||
| ; CHECK-LABEL: s_usub_v2i64: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_sub_u32 s10, s2, s6 | ||
| ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 | ||
| ; CHECK-NEXT: s_subb_u32 s8, s3, s7 | ||
| ; CHECK-NEXT: s_sub_u32 s6, s2, s6 | ||
| ; CHECK-NEXT: s_subb_u32 s7, s3, s7 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_sub_u32 s0, s0, s4 | ||
| ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 | ||
| ; CHECK-NEXT: s_subb_u32 s1, s1, s5 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg | |
| ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 | ||
| ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v4, s10 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, s8 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v4, s6 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, s7 | ||
| ; CHECK-NEXT: s_mov_b32 s1, s0 | ||
| ; CHECK-NEXT: s_mov_b32 s3, s2 | ||
| ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] | ||
|
|
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) | |
| ; CHECK-LABEL: s_uadd_i64: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_add_u32 s0, s0, s2 | ||
| ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 | ||
| ; CHECK-NEXT: s_addc_u32 s1, s1, s3 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { | |
| ; CHECK-LABEL: s_uadd_p1: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_add_u32 s0, s0, 1 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 | ||
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { | |
| ; CHECK-LABEL: s_usub_p1: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_sub_u32 s0, s0, 1 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 | ||
| ; CHECK-NEXT: s_subb_u32 s1, s1, 0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { | |
| ; CHECK-LABEL: s_usub_n1: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_sub_u32 s0, s0, -1 | ||
| ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 | ||
| ; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 | ||
| ; CHECK-NEXT: s_subb_u32 s1, s1, -1 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, s1 | ||
|
|
||
Uh oh! There was an error while loading. Please reload this page.