[AMDGPU] Ensure SCC is not live before shrinking to s_bitset* #167907
Conversation
Signed-off-by: John Lu <John.Lu@amd.com>
This fixes an issue exposed by #164201.
@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes: Ensure SCC is not live before shrinking s_and*/s_or* instructions to s_bitset*.

Full diff: https://github.com/llvm/llvm-project/pull/167907.diff

2 Files Affected:
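For context, a minimal sketch of the hazard, adapted from the and32_clear_one_bit test added below (the s_cselect consumer is illustrative):

```asm
; s_and_b32 defines SCC (SCC = result != 0); s_bitset0_b32 does not.
s_and_b32     s0, s0, 0x7fffffff  ; clears bit 31, sets SCC
s_cselect_b64 s[0:1], -1, 0       ; reads SCC from the s_and_b32

; Shrinking the s_and_b32 while SCC is live would be unsound:
s_bitset0_b32 s0, 31              ; clears bit 31, leaves SCC untouched
s_cselect_b64 s[0:1], -1, 0       ; now reads a stale SCC value
```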
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 179ecbad5239f..490c8392de8dd 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -535,6 +535,8 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
unsigned Opc = MI.getOpcode();
const MachineOperand *Dest = &MI.getOperand(0);
MachineOperand *Src0 = &MI.getOperand(1);
@@ -550,7 +552,9 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
uint32_t NewImm = 0;
if (Opc == AMDGPU::S_AND_B32) {
- if (isPowerOf2_32(~Imm)) {
+ if (isPowerOf2_32(~Imm) &&
+ MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, std::next(MBBI), 2) ==
+ MachineBasicBlock::LQR_Dead) {
NewImm = llvm::countr_one(Imm);
Opc = AMDGPU::S_BITSET0_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
@@ -558,7 +562,9 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
Opc = AMDGPU::S_ANDN2_B32;
}
} else if (Opc == AMDGPU::S_OR_B32) {
- if (isPowerOf2_32(Imm)) {
+ if (isPowerOf2_32(Imm) &&
+ MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, std::next(MBBI), 2) ==
+ MachineBasicBlock::LQR_Dead) {
NewImm = llvm::countr_zero(Imm);
Opc = AMDGPU::S_BITSET1_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 0166d7ac7ddc2..b5228e3054f0a 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -153,6 +153,43 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
ret i32 %zext
}
+define amdgpu_ps i32 @and32_clear_one_bit(i32 inreg %val0) {
+; CHECK-LABEL: and32_clear_one_bit:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 0x7fffffff
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i32 %val0, 2147483647
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @and64_clear_one_bit(i64 inreg %val0) {
+; CHECK-LABEL: and64_clear_one_bit:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bitset0_b32 s0, 31
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i64 %val0, -2147483649
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
@@ -623,14 +660,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB36_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB38_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB36_3
-; CHECK-NEXT: .LBB36_2: ; %if
+; CHECK-NEXT: s_branch .LBB38_3
+; CHECK-NEXT: .LBB38_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB36_3
-; CHECK-NEXT: .LBB36_3:
+; CHECK-NEXT: s_branch .LBB38_3
+; CHECK-NEXT: .LBB38_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if
dstutt left a comment:
This looks reasonable.
I haven't verified it against the original regressions we saw yet, but I can do that if you need it before this is merged. (Probably not required?)
Please verify that it fixes your regressions. That will give us more confidence that it fixes the issue fully rather than only some of the regressions.
  if (Opc == AMDGPU::S_AND_B32) {
-   if (isPowerOf2_32(~Imm)) {
+   if (isPowerOf2_32(~Imm) &&
+       MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, std::next(MBBI), 2) ==
I think it would be better to just check the dead flag on the SCC def.
A neighborhood search of 2 seems too small; the dead flag is absolute.
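For reference, a minimal sketch of the dead-flag approach (assuming MI is the scalar logic op being shrunk; scalar logic ops carry an implicit SCC def):

```cpp
// The dead flag on the instruction's own implicit SCC def is maintained as
// part of liveness bookkeeping, so unlike a bounded forward scan it cannot
// miss a use that falls outside the search window.
MachineOperand *SccDef =
    MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
bool SccIsDead = SccDef && SccDef->isDead();
```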
Use isDead.
Signed-off-by: John Lu <John.Lu@amd.com>
✅ With the latest revision this PR passed the C/C++ code formatter.
Signed-off-by: John Lu <John.Lu@amd.com>
-   if (isPowerOf2_32(~Imm)) {
+   MachineOperand *SccDef =
+       MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+   if (isPowerOf2_32(~Imm) && SccDef->isDead()) {
Suggested change:
-   if (isPowerOf2_32(~Imm) && SccDef->isDead()) {
+   if (SccDef->isDead() && isPowerOf2_32(~Imm)) {
-   if (isPowerOf2_32(Imm)) {
+   MachineOperand *SccDef =
+       MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+   if (isPowerOf2_32(Imm) && SccDef->isDead()) {
Suggested change:
-   if (isPowerOf2_32(Imm) && SccDef->isDead()) {
+   if (SccDef->isDead() && isPowerOf2_32(Imm)) {
Alternatively, short-circuit the call to findRegisterDefOperand in the pretty common case that Imm is not a power of two?
Suggested change:
-   if (isPowerOf2_32(Imm) && SccDef->isDead()) {
+   if (isPowerOf2_32(Imm) && MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
dstutt left a comment:
Confirmed, this fixes the regressions we observed. Thanks.
Ensure SCC is not live before shrinking s_and*/s_or* instructions to s_bitset*.
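For reference, a hedged sketch (not necessarily the committed code) of how the S_AND_B32 case reads with both review suggestions applied:

```cpp
if (Opc == AMDGPU::S_AND_B32) {
  // isPowerOf2_32 short-circuits the operand scan in the common case, and
  // the dead flag on MI's own implicit SCC def gates the shrink. Scalar
  // logic ops always carry an implicit SCC def, so the dereference is
  // assumed safe here.
  if (isPowerOf2_32(~Imm) &&
      MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) {
    NewImm = llvm::countr_one(Imm);
    Opc = AMDGPU::S_BITSET0_B32;
  }
  // ... the S_ANDN2_B32 fallback and the S_OR_B32 case follow the same
  // pattern, using isPowerOf2_32(Imm) and countr_zero for S_BITSET1_B32.
}
```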