bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
- bool optimizeVccBranch(MachineInstr &MI) const;
-
public:
static char ID;
return true;
}
-bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
- // Match:
- // sreg = -1
- // vcc = S_AND_B64 exec, sreg
- // S_CBRANCH_VCC[N]Z
- // =>
- // S_CBRANCH_EXEC[N]Z
- bool Changed = false;
- MachineBasicBlock &MBB = *MI.getParent();
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- const bool IsWave32 = ST.isWave32();
- const unsigned CondReg = TRI->getVCC();
- const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-
- MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
- E = MBB.rend();
- bool ReadsCond = false;
- unsigned Threshold = 5;
- for (++A ; A != E ; ++A) {
- if (!--Threshold)
- return false;
- if (A->modifiesRegister(ExecReg, TRI))
- return false;
- if (A->modifiesRegister(CondReg, TRI)) {
- if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
- return false;
- break;
- }
- ReadsCond |= A->readsRegister(CondReg, TRI);
- }
- if (A == E)
- return false;
-
- MachineOperand &Op1 = A->getOperand(1);
- MachineOperand &Op2 = A->getOperand(2);
- if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
- TII->commuteInstruction(*A);
- Changed = true;
- }
- if (Op1.getReg() != ExecReg)
- return Changed;
- if (Op2.isImm() && Op2.getImm() != -1)
- return Changed;
-
- unsigned SReg = AMDGPU::NoRegister;
- if (Op2.isReg()) {
- SReg = Op2.getReg();
- auto M = std::next(A);
- bool ReadsSreg = false;
- for ( ; M != E ; ++M) {
- if (M->definesRegister(SReg, TRI))
- break;
- if (M->modifiesRegister(SReg, TRI))
- return Changed;
- ReadsSreg |= M->readsRegister(SReg, TRI);
- }
- if (M == E ||
- !M->isMoveImmediate() ||
- !M->getOperand(1).isImm() ||
- M->getOperand(1).getImm() != -1)
- return Changed;
- // First if sreg is only used in and instruction fold the immediate
- // into that and.
- if (!ReadsSreg && Op2.isKill()) {
- A->getOperand(2).ChangeToImmediate(-1);
- M->eraseFromParent();
- }
- }
-
- if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
- MI.killsRegister(CondReg, TRI))
- A->eraseFromParent();
-
- bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
- if (SReg == ExecReg) {
- if (IsVCCZ) {
- MI.eraseFromParent();
- return true;
- }
- MI.setDesc(TII->get(AMDGPU::S_BRANCH));
- } else {
- MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
- : AMDGPU::S_CBRANCH_EXECNZ));
- }
-
- MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
- MI.addImplicitDefUseOperands(*MBB.getParent());
-
- return true;
-}
-
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
}
break;
- case AMDGPU::S_CBRANCH_VCCZ:
- case AMDGPU::S_CBRANCH_VCCNZ:
- MadeChange |= optimizeVccBranch(MI);
- break;
-
default:
break;
}
--- /dev/null
+//===-- SIPreEmitPeephole.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs the peephole optimizations before code emission.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-emit-peephole"
+
+namespace {
+
+/// Machine function pass that performs peephole optimizations before code
+/// emission.
+///
+/// The instruction-info and register-info pointers start out null and are
+/// filled in by runOnMachineFunction() before any rewriting is attempted.
+class SIPreEmitPeephole : public MachineFunctionPass {
+public:
+  /// Pass identifier handed to the MachineFunctionPass base class.
+  static char ID;
+
+  SIPreEmitPeephole() : MachineFunctionPass(ID) {
+    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Visits the first terminator of every block in \p MF and returns true if
+  /// anything was changed.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  /// Attempts to turn a branch on VCC into a branch on EXEC; returns true if
+  /// any instruction was modified.
+  bool optimizeVccBranch(MachineInstr &MI) const;
+
+  const SIInstrInfo *TII = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+};
+
+} // End anonymous namespace.
+
+// Pass registration. DEBUG_TYPE ("si-pre-emit-peephole") and the string
+// "SI peephole optimizations" are the textual identifiers handed to the macro.
+// NOTE(review): the two trailing `false` arguments are presumably the
+// CFG-only and is-analysis flags of LLVM's INITIALIZE_PASS -- confirm against
+// llvm/PassSupport.h.
+INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+ "SI peephole optimizations", false, false)
+
+// Out-of-class definition of the static ID member declared in the class.
+char SIPreEmitPeephole::ID = 0;
+
+// Reference to the pass ID published in the llvm namespace.
+// NOTE(review): presumably declared in AMDGPU.h so other code can refer to
+// this pass -- confirm.
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+
+/// Try to replace a conditional branch on VCC with a branch on EXEC.
+///
+/// \param MI The branch to optimize. The caller only passes S_CBRANCH_VCCZ or
+///           S_CBRANCH_VCCNZ instructions.
+/// \returns true if any instruction in the block was modified (including the
+///          case where only the AND operands were commuted), false otherwise.
+bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
+  // Match:
+  //   sreg = -1
+  //   vcc = S_AND_B64 exec, sreg
+  //   S_CBRANCH_VCC[N]Z
+  // =>
+  //   S_CBRANCH_EXEC[N]Z
+  // We end up with this pattern sometimes after basic block placement.
+  // It happens while combining a block which assigns -1 to a saved mask and
+  // another block which consumes that saved mask and then a branch.
+  bool Changed = false;
+  MachineBasicBlock &MBB = *MI.getParent();
+  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+  const bool IsWave32 = ST.isWave32();
+  const unsigned CondReg = TRI->getVCC();
+  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+
+  // Walk backwards from the branch to find the instruction that defines the
+  // condition register. The search is bounded by Threshold so that only a
+  // handful of instructions are inspected.
+  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+                                      E = MBB.rend();
+  bool ReadsCond = false;
+  unsigned Threshold = 5;
+  for (++A; A != E; ++A) {
+    if (!--Threshold)
+      return false;
+    // EXEC must be unchanged between the AND and the branch, otherwise
+    // branching on EXEC would test a different value.
+    if (A->modifiesRegister(ExecReg, TRI))
+      return false;
+    if (A->modifiesRegister(CondReg, TRI)) {
+      // Only a full definition of the condition register by the expected AND
+      // opcode is acceptable; any other write ends the match.
+      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+        return false;
+      break;
+    }
+    // Remember other readers of the condition register: if there are any, the
+    // AND cannot be deleted later.
+    ReadsCond |= A->readsRegister(CondReg, TRI);
+  }
+  if (A == E)
+    return false;
+
+  // A source operand of the AND may be an immediate, so the operand kind has
+  // to be checked before getReg() is called on it.
+  auto IsExec = [ExecReg](const MachineOperand &Op) {
+    return Op.isReg() && Op.getReg() == ExecReg;
+  };
+
+  // Canonicalize so that EXEC, if present, is the first source operand.
+  MachineOperand &Op1 = A->getOperand(1);
+  MachineOperand &Op2 = A->getOperand(2);
+  if (!IsExec(Op1) && IsExec(Op2)) {
+    TII->commuteInstruction(*A);
+    Changed = true;
+  }
+  if (!IsExec(Op1))
+    return Changed;
+  // An immediate mask only matches when it is all ones.
+  if (Op2.isImm() && Op2.getImm() != -1)
+    return Changed;
+
+  Register SReg;
+  if (Op2.isReg()) {
+    // The mask is held in a register: search backwards for its definition,
+    // which has to be a move of the immediate -1.
+    SReg = Op2.getReg();
+    auto M = std::next(A);
+    bool ReadsSreg = false;
+    for (; M != E; ++M) {
+      if (M->definesRegister(SReg, TRI))
+        break;
+      // A write that touches SReg without fully defining it makes the value
+      // unknown; give up.
+      if (M->modifiesRegister(SReg, TRI))
+        return Changed;
+      ReadsSreg |= M->readsRegister(SReg, TRI);
+    }
+    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+        M->getOperand(1).getImm() != -1)
+      return Changed;
+    // First, if sreg is only used by the AND instruction, fold the immediate
+    // into that AND and drop the move.
+    if (!ReadsSreg && Op2.isKill()) {
+      A->getOperand(2).ChangeToImmediate(-1);
+      M->eraseFromParent();
+    }
+  }
+
+  // The AND itself is removable when nothing else reads the condition
+  // register, its SCC def is dead and the branch kills the condition register.
+  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+      MI.killsRegister(CondReg, TRI))
+    A->eraseFromParent();
+
+  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+  if (SReg == ExecReg) {
+    // The mask register is EXEC itself and was just set to -1, so the
+    // condition is known to be non-zero: a "branch if zero" is never taken and
+    // a "branch if non-zero" is always taken.
+    if (IsVCCZ) {
+      MI.eraseFromParent();
+      return true;
+    }
+    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+  } else {
+    MI.setDesc(
+        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
+  }
+
+  // Drop the use of the condition register that belonged to the old opcode and
+  // add the implicit operands required by the new one.
+  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+  MI.addImplicitDefUseOperands(*MBB.getParent());
+
+  return true;
+}
+
+/// Pass entry point: caches the subtarget's instruction and register info,
+/// then tries the VCC-branch peephole on the first terminator of each block.
+///
+/// \returns true if any block in \p MF was modified.
+bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  TII = Subtarget.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &Block : MF) {
+    MachineBasicBlock::iterator Term = Block.getFirstTerminator();
+    // Blocks without a terminator have nothing to optimize.
+    if (Term == Block.end())
+      continue;
+
+    // Only branches conditional on VCC are candidates.
+    const unsigned Opcode = Term->getOpcode();
+    if (Opcode == AMDGPU::S_CBRANCH_VCCZ || Opcode == AMDGPU::S_CBRANCH_VCCNZ)
+      MadeChange |= optimizeVccBranch(*Term);
+  }
+
+  return MadeChange;
+}