From 4c4d2fe280bc8a04598e7e9d75c63fb64ee4cd81 Mon Sep 17 00:00:00 2001
From: Tim Corringham
Date: Mon, 10 Dec 2018 12:06:10 +0000
Subject: [PATCH] [AMDGPU] Add new Mode Register pass

A new pass to manage the Mode register. Currently this just manages the
floating point double precision rounding requirements, but is intended
to be easily extended to encompass all Mode register settings.

The immediate motivation comes from the requirement to use the
round-to-zero rounding mode for the 16-bit interpolation instructions,
where the rounding mode setting is shared between 16- and 64-bit
operations.

llvm-svn: 348754
---
 llvm/lib/Target/AMDGPU/AMDGPU.h                |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   9 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt          |   1 +
 llvm/lib/Target/AMDGPU/SIDefines.h             |   5 +-
 llvm/lib/Target/AMDGPU/SIInstrFormats.td       |   6 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h           |   8 +
 llvm/lib/Target/AMDGPU/SIModeRegister.cpp      | 406 ++++++++++++++++++++++
 llvm/lib/Target/AMDGPU/VOP1Instructions.td     |   8 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td     |   7 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td     |  36 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td    |   8 +-
 llvm/test/CodeGen/AMDGPU/mode-register.mir     | 459 +++++++++++++++++++++++++
 12 files changed, 946 insertions(+), 11 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SIModeRegister.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/mode-register.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b77b1f8ad..d26397a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
@@ -195,6 +196,9 @@ extern char &SIMemoryLegalizerID;
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e3ab1fb..70d365f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -150,6 +150,13 @@ static cl::opt<bool> EnableAtomicOptimizations(
   cl::init(false),
   cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -189,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -894,6 +902,7 @@ void GCNPassConfig::addPreEmitPass() {
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
+  addPass(createSIModeRegisterPass());
 
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bdfaaba..7d12199 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -120,6 +120,7 @@ add_llvm_target(AMDGPUCodeGen
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
   GCNDPPCombine.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index a6d28d6..7f6abc3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@ enum : uint64_t {
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index b73d309..65ffc27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "",
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this uses floating point double precision
+  // rounding mode flags
+  field bit FPDPRounding = 0;
+
   // Decoders should take care of this.
   field bit DisableSIDecoder = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ccccd99..5b1a05f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -604,6 +604,14 @@ public:
     return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
new file mode 100644
index 0000000..c4cad95
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
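+// A brief sketch of the relevant MODE register layout (based on the
+// FP_ROUND_* and FP_ROUND_MODE_DP definitions in SIDefines.h; the hardware
+// ISA manual is the authoritative reference):
+//   bits [1:0] - single precision round mode
+//   bits [3:2] - double precision (and f16) round mode
+// where the 2-bit encoding is 0 = nearest even, 1 = +infinity,
+// 2 = -infinity, 3 = toward zero.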
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0) {}
+
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+    Mode &= Mask;
+  }
+
+  // merge two status values such that only values that don't conflict are
+  // preserved
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // merge an unknown value by using the unknown value's mask to remove bits
+  // from the result
+  Status mergeUnknown(unsigned newMask) {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // intersect two Status values to produce a mode and mask that is a subset
+  // of both values
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // produce the delta required to change the Mode to the required Mode
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  bool isCompatible(Status &S) {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+
+  bool isCombinable(Status &S) {
+    return !(Mask & S.Mask) || isCompatible(S);
+  }
+};
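+// An illustrative example of the Status algebra above (values hypothetical,
+// using the two double precision rounding bits at [3:2]):
+//   "DP mode known to be round-to-nearest" is Status(0xc, 0x0), and
+//   "DP mode known to be round-to-zero"    is Status(0xc, 0xc).
+// Then:
+//   Status(0xc, 0x0).intersect(Status(0xc, 0xc)) == Status(0x0, 0x0)
+//     - two predecessors that disagree leave the bits unknown;
+//   Status(0xc, 0x0).delta(Status(0xc, 0xc))     == Status(0xc, 0xc)
+//     - both bits must be written to switch from nearest to zero;
+//   Status(0x0, 0x0).merge(Status(0xc, 0xc))     == Status(0xc, 0xc)
+//     - an unknown state merged with a requirement adopts the requirement.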
+class BlockData {
+public:
+  // The Status that represents the mode register settings required by the
+  // FirstInsertionPoint (if any) in this block. Calculated in Phase 1.
+  Status Require;
+
+  // The Status that represents the net changes to the Mode register made by
+  // this block. Calculated in Phase 1.
+  Status Change;
+
+  // The Status that represents the mode register settings on exit from this
+  // block. Calculated in Phase 2.
+  Status Exit;
+
+  // The Status that represents the intersection of exit Mode register settings
+  // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  Status Pred;
+
+  // In Phase 1 we record the first instruction that has a mode requirement,
+  // which is used in Phase 3 if we need to insert a mode change.
+  MachineInstr *FirstInsertionPoint;
+
+  BlockData() : FirstInsertionPoint(nullptr) {}
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  std::vector<std::unique_ptr<BlockData>> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // Tracks whether this pass changed the current function, so that the
+  // correct value is returned from runOnMachineFunction even in builds where
+  // the NumSetregInserted statistic is compiled out.
+  bool Changed = false;
+
+  // The default mode register setting currently only caters for the floating
+  // point double precision rounding mode.
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per-function rounding mode setting once such
+  // a setting exists.
+  unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+  Status DefaultStatus =
+      Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+  void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+                    const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  if (TII->usesFPDPRounding(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_INTERP_P1LL_F16:
+    case AMDGPU::V_INTERP_P1LV_F16:
+    case AMDGPU::V_INTERP_P2_F16:
+      // f16 interpolation instructions need double precision round to zero
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    default:
+      return DefaultStatus;
+    }
+  }
+  return Status();
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
+    unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+    unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    Changed = true;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+  }
+}
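+// For example, selecting round-to-zero for double precision means writing the
+// value 3 into the 2-bit field at offset 2 of the MODE register, so the
+// simm16 built above is ((2 - 1) << 11) | (2 << 6) | (1 << 0) = 2177
+// (with WIDTH_M1_SHIFT_ = 11, OFFSET_SHIFT_ = 6 and ID_MODE = 1, as defined
+// in SIDefines.h). This is the "S_SETREG_IMM32_B32 3, 2177" pattern that the
+// accompanying mode-register.mir test checks for.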
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and InsertionPoint is set we insert a setreg before
+//   that instruction (unless this instruction forms part of the block's
+//   entry requirements in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  auto NewInfo = llvm::make_unique<BlockData>();
+  MachineInstr *InsertionPoint = nullptr;
+  // RequirePending is used to indicate whether we are collecting the initial
+  // requirements for the block, and need to defer the first InsertionPoint to
+  // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explicit setreg that means this block doesn't have any
+  // initial requirements.
+  bool RequirePending = true;
+  Status IPChange;
+  for (MachineInstr &MI : MBB) {
+    Status InstrMode = getInstructionMode(MI, TII);
+    if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+        (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+      // We preserve any explicit mode register setreg instruction we
+      // encounter, as we assume it has been inserted by a higher authority
+      // (this is likely to be a very rare occurrence).
+      unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      unsigned Mask = ((1 << Width) - 1) << Offset;
+
+      // If an InsertionPoint is set we will insert a setreg there.
+      if (InsertionPoint) {
+        insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+        InsertionPoint = nullptr;
+      }
+      // If this is an immediate then we know the value being set, but if it is
+      // not an immediate then we treat the modified bits of the mode register
+      // as unknown.
+      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+        unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+        unsigned Mode = (Val << Offset) & Mask;
+        Status Setreg = Status(Mask, Mode);
+        // If we haven't already set the initial requirements for the block we
+        // don't need to as the requirements start from this explicit setreg.
+        RequirePending = false;
+        NewInfo->Change = NewInfo->Change.merge(Setreg);
+      } else {
+        NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+      }
+    } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+      // This instruction uses the Mode register and its requirements aren't
+      // compatible with the current mode.
+      if (InsertionPoint) {
+        // If the required mode change cannot be included in the current
+        // InsertionPoint changes, we need a setreg and start a new
+        // InsertionPoint.
+        if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+          if (RequirePending) {
+            // This is the first InsertionPoint in the block so we will defer
+            // the insertion of the setreg to Phase 3 where we know whether or
+            // not it is actually needed.
+            NewInfo->FirstInsertionPoint = InsertionPoint;
+            NewInfo->Require = NewInfo->Change;
+            RequirePending = false;
+          } else {
+            insertSetreg(MBB, InsertionPoint, TII,
+                         IPChange.delta(NewInfo->Change));
+            IPChange = NewInfo->Change;
+          }
+          // Set the new InsertionPoint
+          InsertionPoint = &MI;
+        }
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      } else {
+        // No InsertionPoint is currently set - this is either the first in
+        // the block or we have previously seen an explicit setreg.
+        InsertionPoint = &MI;
+        IPChange = NewInfo->Change;
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      }
+    }
+  }
+  if (RequirePending) {
+    // If we haven't yet set the initial requirements for the block we set them
+    // now.
+    NewInfo->FirstInsertionPoint = InsertionPoint;
+    NewInfo->Require = NewInfo->Change;
+  } else if (InsertionPoint) {
+    // We need to insert a setreg at the InsertionPoint
+    insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+  }
+  NewInfo->Exit = NewInfo->Change;
+  BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
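+// As an illustration, given a (hypothetical) block
+//     v_fract_f64 ...        ; needs DP round-to-nearest (the default)
+//     v_interp_p1ll_f16 ...  ; needs DP round-to-zero
+//     v_fract_f64 ...        ; needs DP round-to-nearest again
+// Phase 1 inserts "s_setreg_imm32_b32 3, hwreg(HW_REG_MODE, 2, 2)" before the
+// interp and "s_setreg_imm32_b32 0, hwreg(HW_REG_MODE, 2, 2)" before the
+// second v_fract, while the requirement of the leading v_fract becomes the
+// block's Require value, deferred to Phase 3.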
+// In Phase 2 we revisit each block and calculate the common Mode register
+// value provided by all predecessor blocks. If the Exit value for the block
+// is changed, then we add the successor blocks to the worklist so that the
+// exit value is propagated.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  unsigned ThisBlock = MBB.getNumber();
+  if (MBB.pred_empty()) {
+    // There are no predecessors, so use the default starting status.
+    BlockInfo[ThisBlock]->Pred = DefaultStatus;
+  } else {
+    // Build a status that is common to all the predecessors by intersecting
+    // all the predecessor exit status values.
+    MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+    MachineBasicBlock &PB = *(*P);
+    BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+    for (P = std::next(P); P != E; P = std::next(P)) {
+      MachineBasicBlock *Pred = *P;
+      BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(
+          BlockInfo[Pred->getNumber()]->Exit);
+    }
+  }
+  Status TmpStatus =
+      BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+  if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
+    BlockInfo[ThisBlock]->Exit = TmpStatus;
+    // Add the successors to the work list so we can propagate the changed exit
+    // status.
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+                                          E = MBB.succ_end();
+         S != E; S = std::next(S)) {
+      MachineBasicBlock &B = *(*S);
+      Phase2List.push(&B);
+    }
+  }
+}
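+// For example, in a (hypothetical) diamond CFG where one predecessor exits
+// with DP round-to-nearest and the other with DP round-to-zero, the
+// intersection leaves the DP rounding bits unknown in Pred, so the join block
+// cannot rely on any incoming mode; if it needs a particular DP rounding mode,
+// Phase 3 will insert a setreg (the multiple_mode_direct case in the
+// accompanying mode-register.mir test exercises exactly this).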
+// In Phase 3 we revisit each block and if it has an insertion point defined we
+// check whether the predecessor mode meets the block's entry requirements. If
+// not we insert an appropriate setreg instruction to modify the Mode register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  unsigned ThisBlock = MBB.getNumber();
+  if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
+    Status Delta =
+        BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+    if (BlockInfo[ThisBlock]->FirstInsertionPoint)
+      insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
+    else
+      insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+  }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  Changed = false;
+
+  // Processing is performed in a number of phases
+
+  // Phase 1 - determine the initial mode required by each block, and add
+  // setreg instructions for intra-block requirements.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block. We add all blocks to
+  // the list here, but will also add any that need to be revisited during
+  // Phase 2 processing.
+  for (MachineBasicBlock &BB : MF)
+    Phase2List.push(&BB);
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add an initial setreg to each block where the required entry
+  // mode is not satisfied by the exit mode of all its predecessors.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  BlockInfo.clear();
+
+  // Return whether any setreg was actually inserted. NumSetregInserted is a
+  // statistic that may be compiled out of release builds, so it cannot be
+  // used to determine this.
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 9da99d9..68446ab 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -179,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
 defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -232,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
 let SchedRW = [WriteDoubleAdd] in {
 defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
 defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
 } // End SchedRW = [WriteDoubleAdd]
 
 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -339,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16,
fp_to_sint>; let SchedRW = [WriteQuarterRate32] in { @@ -358,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +} // End FPDPRounding = 1 } diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 1bea9c3..e3fd7b5 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -555,18 +555,23 @@ def : divergent_i64_BinOp ; let SubtargetPredicate = Has16BitInsts in { +let FPDPRounding = 1 in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; +} // End FPDPRounding = 1 + defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; let isCommutable = 1 in { +let FPDPRounding = 1 in { defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; +} // End FPDPRounding = 1 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 1f88a24..4b8c1f2 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -220,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { // VOP3 INTERP //===----------------------------------------------------------------------===// -class VOP3Interp : VOP3_Pseudo { +class VOP3Interp pattern = []> : + VOP3_Pseudo { let AsmMatchConverter = "cvtVOP3Interp"; } @@ -292,9 +293,11 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +let FPDPRounding = 1 in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>; +} // End FPDPRounding = 1 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum_like, 1>; def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] @@ -324,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, getVOP3VCC.ret> { let SchedRW = [WriteDouble]; + let FPDPRounding = 1; } } // End Uses = [VCC, EXEC] @@ -354,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = 
[WriteDoubleAdd], FPDPRounding = 1 in { def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp, 1>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; @@ -368,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; let AsmMatchConverter = ""; + let FPDPRounding = 1; } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; @@ -431,39 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile, fma> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { -def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; +let FPDPRounding = 1 in { +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; +let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} +} // End Uses = [M0, EXEC] +} // End FPDPRounding = 1 +} // End renamedInGFX9 = 1 let SubtargetPredicate = isGFX9 in { -def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>; +def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile> { + let FPDPRounding = 1; +} def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile>; def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile>; def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9 +let Uses = [M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +} // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -845,12 +862,15 @@ defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>; defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>; +let FPDPRounding = 1 in { defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">; -defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; -defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">; defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">; defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, 
"V_INTERP_P2_F16", "v_interp_p2_legacy_f16">; +} // End FPDPRounding = 1 + +defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; +defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 2efd28b..0d25a86 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -42,12 +42,14 @@ class VOP3_VOP3PInst, fma>; def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>; def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>; +let FPDPRounding = 1 in { +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile, fma>; def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>; +} // End FPDPRounding = 1 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; @@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in { let isCommutable = 1 in { def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile, 1>; let ClampLo = 0, ClampHi = 1 in { def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats; @@ -154,12 +158,14 @@ let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile, 1>; let ClampLo = 0, ClampHi = 1 in { def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats; diff --git a/llvm/test/CodeGen/AMDGPU/mode-register.mir b/llvm/test/CodeGen/AMDGPU/mode-register.mir new file mode 100644 index 0000000..bf06a01 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mode-register.mir @@ -0,0 +1,459 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s + +--- +# check that the mode is changed to rtz from default rtn for interp f16 +# CHECK-LABEL: name: interp_f16_default +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_INTERP_P1LL_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: interp_f16_default + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... 
+--- +# check that the mode is not changed for interp f16 when the mode is already RTZ +# CHECK-LABEL: name: interp_f16_explicit_rtz +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_MOV_B32_e32 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: interp_f16_explicit_rtz + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + S_SETREG_IMM32_B32 3, 2177 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that explicit RTN mode change is registered +# CHECK-LABEL: name: explicit_rtn +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: V_INTERP_P1LL_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_ADD_F16_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: explicit_rtn + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + S_SETREG_IMM32_B32 0, 2177 + $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that the mode is unchanged from RTN for F64 instruction +# CHECK-LABEL: name: rtn_default +# CHECK-LABEL: bb.0: +# CHECK-NOT: S_SETREG_IMM32_B32 +# CHECK: V_FRACT_F64 + +name: rtn_default + +body: | + bb.0: + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_ENDPGM +... +--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# CHECK-LABEL: name: rtn_from_rtz +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NEXT: V_FRACT_F64 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: rtn_from_rtz + +body: | + bb.0: + liveins: $vgpr1_vgpr2 + S_SETREG_IMM32_B32 3, 2177 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_ENDPGM +... +--- +# CHECK-LABEL: name: rtz_from_rtn +# CHECK-LABEL: bb.1: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: rtz_from_rtn + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... 
+--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# and back again for remaining interp instruction +# CHECK-LABEL: name: interp_f16_plus_sqrt_f64 +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: interp_f16_plus_sqrt_f64 + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that an explicit change to the single precision mode has no effect +# CHECK-LABEL: name: single_precision_mode_change +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: single_precision_mode_change + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_SETREG_IMM32_B32 2, 2049 + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that mode is propagated back to start of loop - first instruction is RTN but needs +# setreg as RTZ is set in loop +# CHECK-LABEL: name: loop +# CHECK-LABEL: bb.1: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64 +# CHECK-LABEL: bb.2: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: loop + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.1, %bb.3 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit $vcc + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM +... 
+---
+# two back-edges to same node with different modes
+# CHECK-LABEL: name: double_loop
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+
+name: double_loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    S_NOP 1
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.1, %bb.6
+    S_SETREG_IMM32_B32 3, 2177
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_ENDPGM
+...
+---
+# check that mode is propagated back to start of loop and through a block that
+# neither sets nor uses the mode.
+# CHECK-LABEL: name: loop_indirect
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_NOP 1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.1, %bb.4
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated to a block that uses the mode
+# CHECK-LABEL: name: multiple_mode_direct
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_direct
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated through a block that neither
+# sets nor uses the mode.
+# CHECK-LABEL: name: multiple_mode_indirect
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM
+...
+--- +# CHECK-LABEL: name: pass_through_blocks +# CHECK-LABEL: bb.0: +# CHECK: V_FRACT_F64_e32 +# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: pass_through_blocks + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_BRANCH %bb.4 + + bb.4: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... +--- +# check that multiple mode values are propagated +# CHECK-LABEL: name: if_then_else +# CHECK-LABEL: bb.3: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_FRACT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: if_then_else + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.3, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM +... -- 2.7.4