MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
- MachineBasicBlock *MBB = MI.getParent();;
+ MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
.setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
- if (Op32DstIdx != -1) {
+ if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
// dst
Inst32.add(MI.getOperand(0));
- } else {
+ } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
+ // VOPCX instructions won't be writing to an explicit dst, so this should
+ // not fail for these instructions.
assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
(MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
LLVM_READONLY
int getMFMAEarlyClobberOp(uint16_t Opcode);
+ /// \returns v_cmpx version of a v_cmp instruction, or -1 if there is none.
+ LLVM_READONLY
+ int getVCMPXOpFromVCMP(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
let ValueCols = [["0"]];
}
+// Maps a v_cmp instruction to its v_cmpx equivalent.
+def getVCMPXOpFromVCMP : InstrMapping {
+ let FilterClass = "VCMPVCMPXTable";
+ let RowFields = ["VCMPOp"];
+ let ColFields = ["IsVCMPX"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
return false;
}
+// Backwards-iterate from Origin (for at most MaxInstructions instructions, or
+// until the beginning of the BB is reached) and return the first instruction
+// for which Pred evaluates to true. Pred can be an arbitrary condition on the
+// current MachineInstr, for instance a check for a specific target
+// instruction. Returns nullptr prematurely if any of the registers given in
+// NonModifiableRegs is modified by an instruction encountered on the way.
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+ std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
+ MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+ E = Origin.getParent()->rend();
+ unsigned CurrentIteration = 0;
+
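+  // Start the search at the instruction preceding Origin; Origin itself is
+  // never examined.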
+ for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+ if (Pred(&*A))
+ return &*A;
+
+ for (MCRegister Reg : NonModifiableRegs) {
+ if (A->modifiesRegister(Reg, TRI))
+ return nullptr;
+ }
+
+ ++CurrentIteration;
+ }
+
+ return nullptr;
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+// It does so by calculating liveness backwards from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// Once liveness is calculated, we can determine if Reg is still in use and not
+// defined in between the instructions.
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ LivePhysRegs LR(*TRI);
+ LR.addLiveOuts(*Stop.getParent());
+
+ for (auto A = Stop.getParent()->rbegin();
+ A != Stop.getParent()->rend() && A != Stop; ++A) {
+ LR.stepBackward(*A);
+ }
+
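+  // available() reports false if Reg is reserved or if Reg or any aliasing
+  // register is still live, so partially overlapping uses are caught as well.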
+ return !LR.available(MRI, Reg);
+}
+
+// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
+// sequence by looking at an instance of an s_and_saveexec instruction. Returns
+// a pointer to the matching v_cmp instruction if it is safe to replace the
+// sequence (see the conditions in the function body), or nullptr otherwise.
+// This runs after register allocation, so a number of operand dependencies
+// need to be checked explicitly.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+ MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+ Register SaveExecDest = SaveExec.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(MRI, SaveExecDest))
+ return nullptr;
+
+ MachineOperand *SaveExecSrc0 =
+ TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand, with no write to Exec or to the saveexec input operand in
+  // between.
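+  // A write to Exec in between would change the set of active lanes the
+  // compare executes on once it is moved down to the saveexec; a write to the
+  // saveexec input would mean the v_cmp found is not its actual definition.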
+  MachineInstr *VCmp = findInstrBackwards(
+ SaveExec,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()}, TRI);
+
+ if (!VCmp)
+ return nullptr;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+ SaveExec.modifiesRegister(Src0->getReg(), TRI))
+ return nullptr;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+ SaveExec.modifiesRegister(Src1->getReg(), TRI))
+ return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB's live-outs, meaning it's used in any of its successors; that
+  // would lead to incorrect code if the v_cmp, and therefore the def of
+  // the dest operand, were removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return nullptr;
+
+ // If the v_cmp target is in use after the s_and_saveexec, skip the
+ // optimization.
+ if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+ return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // source operands between the v_cmp and the saveexec.
+  // If so, additional VGPR spilling might have to be inserted, in which case
+  // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+ NonDefRegs, TRI))
+ return nullptr;
+
+ return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ MachineRegisterInfo &MRI) {
+ const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+ if (NewOpcode == -1)
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+ Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+ MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
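+  // Keep a copy of the original exec mask in the saveexec destination via
+  // s_mov; the v_cmpx emitted below then updates Exec directly.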
+ if (!SaveExecInstr.uses().empty()) {
+    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+ SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+ .addReg(Exec);
+ }
+
+ // Omit dst as V_CMPX is implicitly writing to EXEC.
+ // Add dummy src and clamp modifiers, if needed.
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+ VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
+ -1)
+ Builder.addImm(0);
+
+ Builder.add(*Src0);
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
+ -1)
+ Builder.addImm(0);
+
+ Builder.add(*Src1);
+
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
+ Builder.addImm(0);
+
+ return true;
+}
+
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
Changed = true;
}
+ // After all s_op_saveexec instructions are inserted,
+ // replace (on GFX10.3 and later)
+ // v_cmp_* SGPR, IMM, VGPR
+ // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+ // with
+ // s_mov_b32 EXEC_SGPR_DEST, exec_lo
+ // v_cmpx_* IMM, VGPR
+ // to reduce pipeline stalls.
+ if (ST.hasGFX10_3Insts()) {
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ const unsigned AndSaveExecOpcode =
+ ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Record relevant v_cmp / s_and_saveexec instruction pairs for
+ // replacement.
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ continue;
+
+ if (MachineInstr *VCmp =
+ findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+ SaveExecVCmpMapping[&MI] = VCmp;
+ }
+ }
+
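+    // Rewrite the recorded pairs in a second sweep so that erasing
+    // instructions does not invalidate the iteration above.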
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
+ TRI, *MRI)) {
+ SaveExecInstr->eraseFromParent();
+ VCmpInstr->eraseFromParent();
+
+ Changed = true;
+ }
+ }
+ }
+
return Changed;
}
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- Register DstReg = MI.getOperand(0).getReg();
- if (DstReg.isVirtual()) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because this is only one register and
- // cannot deal with sequences which would require multiple copies of
- // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
- //
- // So, instead of forcing the instruction to write to VCC, we provide
- // a hint to the register allocator to use VCC and then we will run
- // this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
- continue;
+ MachineOperand &Op0 = MI.getOperand(0);
+ if (Op0.isReg()) {
+ // Exclude VOPCX instructions as these don't explicitly write a
+ // dst.
+ Register DstReg = Op0.getReg();
+ if (DstReg.isVirtual()) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we
+ // provide a hint to the register allocator to use VCC and then we
+ // will run this pass again after RA and shrink it if it outputs to
+ // VCC.
+ MRI.setRegAllocationHint(DstReg, 0, VCCReg);
+ continue;
+ }
+ if (DstReg != VCCReg)
+ continue;
}
- if (DstReg != VCCReg)
- continue;
}
if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
string NoSDstOp = Name;
}
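+// Relation table entry used by the getVCMPXOpFromVCMP instruction mapping:
+// VCMPOp holds the name of the corresponding v_cmp opcode, while IsVCMPX is
+// set on the v_cmpx (nosdst) variants.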
+class VCMPVCMPXTable <string Name> {
+ bit IsVCMPX = 0;
+ string VCMPOp = Name;
+}
+
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
SDPatternOperator cond = COND_NULL,
def _e32 : VOPC_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e32"> {
+ VCMPXNoSDstTable<1, opName#"_e32">,
+ VCMPVCMPXTable<opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e64"> {
+ VCMPXNoSDstTable<1, opName#"_e64">,
+ VCMPVCMPXTable<opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e32"> {
+ VCMPXNoSDstTable<0, opName#"_e32">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isConvergent = 1;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e64"> {
+ VCMPXNoSDstTable<0, opName#"_e64">,
+ VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
+ let IsVCMPX = 1;
}
foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
; GFX1030: s_cmp_lg_u32
-; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_cmp_lg_u32
; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
}
; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: v_cmp_eq_u32
-; GFX1030: s_and_saveexec_b32
-; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: s_mov_b32
+; GFX1030: v_cmpx_eq_u32
+; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: v_cmp_eq_u32
; GFX1010: s_and_saveexec_b32
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+
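+; Check that v_cmp ..., s_and_saveexec sequences are replaced by
+; s_mov + v_cmpx on GFX10.3, while GFX10.1 keeps the original sequence.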
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
+entry:
+ %bc = icmp slt i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
+entry:
+ %bc = icmp sgt i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
+; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
+entry:
+ %bc = icmp eq i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
+; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
+entry:
+ %bc = icmp ne i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
+; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
+entry:
+ %bc = icmp sle i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
+; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
+; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
+; GFX1030: s_mov_b32 s{{.*}}, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
+define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
+entry:
+ %bc = icmp sge i32 %x, 16
+ br i1 %bc, label %endif, label %if
+
+if:
+ %ret = shl i32 %x, 2
+ ret i32 %ret
+
+endif:
+ ret i32 %x
+}
+
+declare amdgpu_gfx void @check_live_outs_helper(i64) #0
+
+; In cases where the output operand cannot be safely removed,
+; don't apply the v_cmpx transformation.
+
+; GCN-LABEL: {{^}}check_live_outs:
+; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
+; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
+define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
+ %cond = icmp eq i32 %a, %b
+ %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+ br i1 %cond, label %l1, label %l2
+l1:
+ call amdgpu_gfx void @check_live_outs_helper(i64 %result)
+ br label %l2
+l2:
+ ret void
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
+; GCN: ; %bb.1: ; %then
+; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
+; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
+; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+ %0 = icmp sge i32 %a, 0
+ br i1 %0, label %if, label %then
+
+if:
+ %1 = shl i32 %a, 2
+ %2 = or i32 %1, %b
+ ret i32 %2
+
+then:
+ %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+ %4 = trunc i64 %3 to i32
+ %5 = icmp slt i32 %4, %b
+ br i1 %5, label %after, label %end
+
+after:
+ ret i32 %4
+
+end:
+ ret i32 %a
+}
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# After the optimize exec masking (post-RA) pass, there's a chance of v_cmpx instructions
+# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
+# which can be safely replaced.
+# However, it is not safe to do so when the generated code sequence would drop part of the EXEC mask,
+# which can occur when a subregister of the register holding the saved EXEC mask is used as an
+# input operand of the v_cmp instruction.
+# The idea behind this test is to check that such subregisters are handled correctly.
+
+# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+# GCN: V_CMP_GT_U32_e64
+# GCN: S_AND_SAVEEXEC_B64
+name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr2
+ renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+ $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
+ $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
-; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo