void SelectADD_SUB_I64(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectFMA_W_CHAIN(SDNode *N);
+ void SelectFMUL_W_CHAIN(SDNode *N);
SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);
SelectADD_SUB_I64(N);
return;
}
+ case AMDGPUISD::FMUL_W_CHAIN: {
+ SelectFMUL_W_CHAIN(N);
+ return;
+ }
+ case AMDGPUISD::FMA_W_CHAIN: {
+ SelectFMA_W_CHAIN(N);
+ return;
+ }
+
case ISD::SCALAR_TO_VECTOR:
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2,
+  // clamp, omod, chain, glue
+ SDValue Ops[10];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+ Ops[8] = N->getOperand(0);
+ Ops[9] = N->getOperand(4);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod, chain, glue
+ SDValue Ops[8];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ Ops[6] = N->getOperand(0);
+ Ops[7] = N->getOperand(3);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(FMA_W_CHAIN)
+ NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(COS_HW)
NODE_NAME_CASE(SIN_HW)
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
+ SETREG,
+ // FP ops with input and output chain.
+ FMA_W_CHAIN,
+ FMUL_W_CHAIN,
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
+ SDTCisInt<0>, SDTCisInt<1>
+]>;
+
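+// AMDGPUsetreg writes a value into a hardware register: operand 0 is the
+// value to write, operand 1 is the packed hwreg(id, offset, width) immediate.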
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+ SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
MI->setDesc(TII->get(Opc));
}
+  // Special case for s_setreg_b32: if the source being folded is an
+  // immediate, switch to s_setreg_imm32_b32 so the immediate can be folded
+  // into the instruction.
+ if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+ MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+ }
+
// If we are already folding into another operand of MI, then
// we can't commute the instruction, otherwise we risk making the
// other fold illegal.
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
return SDValue();
}
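+// Emit an FP operation, threading chain and glue through it when GlueChain
+// carries them: if GlueChain produces only a data value the plain node is
+// used, otherwise the opcode is remapped to its *_W_CHAIN equivalent.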
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMUL:
+ Opcode = AMDGPUISD::FMUL_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+ GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue C,
+ SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B, C);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMA:
+ Opcode = AMDGPUISD::FMA_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+ GlueChain.getValue(2));
+}
+
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
- SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ LHS, RHS, LHS);
// Denominator is scaled to not be denormal, so using rcp is ok.
- SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+ DenominatorScaled);
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+ DenominatorScaled);
+
+ const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+ (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
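+  // If FP32 denormals are not already enabled, turn them on around the FMA
+  // sequence by writing the MODE register; the SETREG's chain and glue are
+  // merged into NegDivScale0 so the mode change stays ordered with its uses.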
+ if (!Subtarget->hasFP32Denormals()) {
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(),
+ EnableDenormValue, BitField);
+ SDValue Ops[3] = {
+ NegDivScale0,
+ EnableDenorm.getValue(0),
+ EnableDenorm.getValue(1)
+ };
+
+ NegDivScale0 = DAG.getMergeValues(Ops, SL);
+ }
+
-  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
-  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
-  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
-  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
-  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
-  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+                             ApproxRcp, One, NegDivScale0);
+
+  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+                             ApproxRcp, Fma0);
+
+  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+                           Fma1, Fma1);
+
+  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+                             NumeratorScaled, Mul);
+
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul,
+                             Fma2);
+
+  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+                             NumeratorScaled, Fma3);
+
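+  // Restore the previous denormal mode after the last FMA; the SETREG is
+  // chained behind Fma4 and tied into the DAG root so it cannot be dropped
+  // or reordered ahead of the sequence.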
+ if (!Subtarget->hasFP32Denormals()) {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1),
+ DisableDenormValue,
+ BitField,
+ Fma4.getValue(2));
+
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ DisableDenorm, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ }
SDValue Scale = NumeratorScaled.getValue(1);
- SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+ Fma4, Fma1, Fma3, Scale);
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
// boundaries prevents incorrect movements of such instructions.
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
changesVGPRIndexingMode(MI);
}
>;
}
+let hasSideEffects = 1 in {
+
def S_SETREG_B32 : SOPK_Pseudo <
"s_setreg_b32",
(outs), (ins SReg_32:$sdst, hwreg:$simm16),
- "$simm16, $sdst"
+ "$simm16, $sdst",
+ [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
>;
// FIXME: Not on SI?
let has_sdst = 0;
}
+} // End hasSideEffects = 1
//===----------------------------------------------------------------------===//
// SOPC Instructions
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; These tests check that fdiv is expanded correctly and also test that the
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
-; SI: v_div_scale_f32
-; SI-DAG: v_div_scale_f32
-
-; SI-DAG: v_rcp_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_mul_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
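+; Enable FP32 denormals for the intermediate FMAs (FP_DENORM_FLUSH_NONE = 3),
+; then restore flushing (FP_DENORM_FLUSH_IN_FLUSH_OUT = 0) before div_fmas.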
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv float %a, %b
ret void
}
+; FUNC-LABEL: {{^}}fdiv_f32_denormals:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
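+; FP32 denormals are already enabled for this function, so no mode switch is
+; emitted around the FMA sequence.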
+; GCN-NOT: s_setreg
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN-NOT: s_setreg
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+entry:
+ %fdiv = fdiv float %a, %b
+ store float %fdiv, float addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
-; SI: v_cndmask_b32
-; SI: v_mul_f32
-; SI: v_rcp_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
+; GCN: v_cndmask_b32
+; GCN: v_mul_f32
+; GCN: v_rcp_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv float %a, %b, !fpmath !0
; Use correct fdiv
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: v_fma_f32
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv float %a, %b, !fpmath !0
}
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv fast float %a, %b
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv fast float %a, %b
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv arcp float %a, %b
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv <2 x float> %a, %b
}
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; SI: v_cmp_gt_f32
-; SI: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv fast <2 x float> %a, %b
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in