CurrCycleInstr = nullptr;
}
+/// Feeds \p MI through the recognizer by hand: asks PreEmitNoops() how many
+/// no-ops are needed, passes that count to EmitNoops(), inserts the no-ops in
+/// front of \p MI, and then calls EmitInstruction() and AdvanceCycle().
+/// Asserts that the recognizer is in hazard recognizer mode.
+void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
+  assert(IsHazardRecognizerMode);
+
+  const unsigned NoopCount = PreEmitNoops(MI);
+  EmitNoops(NoopCount);
+
+  // An instruction inside a bundle goes through the bundle-specific helper;
+  // anything else gets its no-ops inserted right before it in its block.
+  if (!MI->isInsideBundle()) {
+    MachineBasicBlock &ParentBlock = *MI->getParent();
+    TII.insertNoops(ParentBlock, MachineBasicBlock::iterator(MI), NoopCount);
+  } else {
+    insertNoopsInBundle(MI, TII, NoopCount);
+  }
+
+  EmitInstruction(MI);
+  AdvanceCycle();
+}
+
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
IsHazardRecognizerMode = true;
CurrCycleInstr = MI;
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixWMMAHazards(MI);
+ fixShift64HighRegBug(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
return true;
}
+bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { // Rewrites an affected 64-bit shift to take its amount from another VGPR; returns true iff MI was changed.
+ if (!ST.hasShift64HighRegBug())
+ return false;
+ // Only the _e64 forms of the three 64-bit shifts listed below are handled.
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AMDGPU::V_LSHLREV_B64_e64:
+ case AMDGPU::V_LSHRREV_B64_e64:
+ case AMDGPU::V_ASHRREV_I64_e64:
+ break;
+ }
+ // src0 is treated as the shift amount; a non-register amount needs no fix.
+ MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
+ if (!Amt->isReg())
+ return false;
+
+ Register AmtReg = Amt->getReg();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Check if this is a last VGPR in the allocation block (VGPR index % 8 == 7).
+ if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
+ return false;
+ // No fix when the next VGPR is used in this function; VGPR255 has no next one. NOTE(review): presumably the bug needs AmtReg to be the last allocated VGPR - confirm.
+ if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
+ return false;
+ // Find out whether the amount register is also part of src1 or is written by MI.
+ MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+ bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
+ bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
+ bool Overlapped = OverlappedSrc || OverlappedDst;
+ // If both overlap, src1 and the destination (operand 0) must be the same register.
+ assert(!OverlappedDst || !OverlappedSrc ||
+ Src1->getReg() == MI->getOperand(0).getReg());
+ assert(ST.needsAlignedVGPRs());
+ static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); // The AmtReg +/- 1 arithmetic in this function relies on consecutive VGPR numbering.
+ // Pick the first register MI neither reads nor writes: an aligned 64-bit pair if overlapped, else a single VGPR.
+ Register NewReg;
+ for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
+ : AMDGPU::VGPR_32RegClass) {
+ if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
+ NewReg = Reg;
+ break;
+ }
+ }
+ // NOTE(review): NewReg is left default-constructed if no candidate is found - presumably unreachable; confirm.
+ Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
+ : NewReg;
+ Register NewAmtLo;
+ // When a pair was chosen, the amount goes to its sub1 (above) and sub0 stands in for AmtReg - 1.
+ if (Overlapped)
+ NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ // Insert a full wait count because found register might be pending a wait.
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ // Insert V_SWAP_B32 instruction(s) before MI and run hazard recognizer on them.
+ if (Overlapped)
+ runOnInstruction(
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
+ .addDef(AmtReg - 1)
+ .addReg(AmtReg - 1)
+ .addReg(NewAmtLo));
+ runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
+ .addDef(AmtReg)
+ .addReg(AmtReg)
+ .addReg(NewAmt));
+
+ // Instructions emitted after the current instruction will be processed by the
+ // parent loop of the hazard recognizer in a natural way. These swaps undo the ones inserted before MI.
+ BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+ AmtReg)
+ .addDef(NewAmt)
+ .addReg(NewAmt)
+ .addReg(AmtReg);
+ if (Overlapped) // Inserted at the same position, so this swap ends up directly after MI, ahead of the one above.
+ BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+ AmtReg - 1)
+ .addDef(NewAmtLo)
+ .addReg(NewAmtLo)
+ .addReg(AmtReg - 1);
+
+ // Re-running hazard recognizer on the modified instruction is not necessary,
+ // inserted V_SWAP_B32 has already both read and written the new registers, so
+ // hazards related to these registers have already been handled.
+ Amt->setReg(NewAmt);
+ Amt->setIsKill(false); // NewAmt is read again by the restoring swap placed after MI.
+ if (OverlappedDst)
+ MI->getOperand(0).setReg(NewReg);
+ if (OverlappedSrc) {
+ Src1->setReg(NewReg);
+ Src1->setIsKill(false); // Both halves of NewReg are read again by the restoring swaps after MI.
+ }
+
+ return true;
+}
+
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
int NSAtoVMEMWaitStates = 1;
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: highest_reg_shift_amt_v7
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v7
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v15
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v15
+ ; GCN: $vgpr15 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr15 = V_SWAP_B32 $vgpr15, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr15, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr15, implicit $exec
+ $vgpr15 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr15, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v255
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v255
+ ; GCN: $vgpr255 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr255 = V_SWAP_B32 $vgpr255, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr255, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr255, implicit $exec
+ $vgpr255 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr255, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_dst
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_src
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_used_v0_both
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_both
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr2, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr2, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_src
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec
+ ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr3, $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_dst
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec
+ ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr3, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_overlapped_both
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_both
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: S_NOP 4
+ ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap2
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2
+ ; GCN: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+...
+
+---
+name: highest_reg_shift_amt_v7_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_v7_bundle
+ ; GCN: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 {
+ ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec
+ BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 {
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr2_vgpr3 = IMPLICIT_DEF
+ renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec
+ }
+...
+
+---
+name: highest_reg_shift_amt_hazard_in_swap2_bundle
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2_bundle
+ ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 {
+ ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec
+ ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 {
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr6_vgpr7 = IMPLICIT_DEF
+ $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec
+ renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec
+ }
+...