setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
return SDValue();
}
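For reference, the lowering above also makes the node reachable directly from IR via the intrinsic. A minimal sketch (illustrative, not part of the patch; the three-operand signature is inferred from the lowering above, which forwards operands 1-3 to the FDOT2 node):

  declare float @llvm.amdgcn.fdot2(<2 x half>, <2 x half>, float)

  ; Computes a.x*b.x + a.y*b.y + acc; selected to v_dot2_f32_f16 on gfx906.
  define float @direct_fdot2(<2 x half> %a, <2 x half> %b, float %acc) {
    %d = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %acc)
    ret float %d
  }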
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
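+ // v_dot2_f32_f16 only exists on subtargets with the DL instructions, and it
+ // produces an f32 result.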
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+ // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+ //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ // fdot2_f32_f16 always flushes fp32 denormal operands and the output to zero,
+ // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
+ // is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
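+ // Both factors of each product must be taken from the same lane: Idx1 for
+ // the outer product, Idx2 for the inner one.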
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same.
+ Idx1 == Idx2)
+ return SDValue();
+
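+ // Each product must draw its two factors from two different vectors.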
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
+
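The tests below write the dot product with fmul/fadd and rely on contraction to form the ISD::FMA chain this combine matches. For clarity, here is the matched shape written with explicit fma calls (an illustrative sketch, not part of the patch; the contract flags stand in for -fp-contract=fast):

  declare float @llvm.fma.f32(float, float, float)

  define float @fdot2_shape(<2 x half> %a, <2 x half> %b, float %acc) {
    %a0 = extractelement <2 x half> %a, i64 0
    %b0 = extractelement <2 x half> %b, i64 0
    %a1 = extractelement <2 x half> %a, i64 1
    %b1 = extractelement <2 x half> %b, i64 1
    %fa0 = fpext half %a0 to float
    %fb0 = fpext half %b0 to float
    %fa1 = fpext half %a1 to float
    %fb1 = fpext half %b1 to float
    ; The inner FMA accumulates lane 1 into %acc; the outer FMA adds lane 0.
    %inner = call contract float @llvm.fma.f32(float %fa1, float %fb1, float %acc)
    %outer = call contract float @llvm.fma.f32(float %fa0, float %fb0, float %inner)
    ret float %outer
  }

Both calls carry the contract flag, which satisfies the hasAllowContract() condition above even when neither -fp-contract=fast nor unsafe-fp-math is in effect.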
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
return performMinMaxCombine(N, DCI);
break;
}
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
return Widened;
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-UNSAFE
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
+; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
+; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
+
+; Tests to make sure fdot2 is not generated when the vector elements of the
+; dot-product expression are not converted from f16 to f32.
+; GCN-LABEL: {{^}}dotproduct_f16
+; GFX900: v_fma_legacy_f16
+; GFX900: v_fma_legacy_f16
+
+; GFX906: v_mul_f16_e32
+; GFX906: v_mul_f16_e32
+
+; GFX906-UNSAFE: v_fma_legacy_f16
+
+; GFX906-CONTRACT: v_mac_f16_e32
+; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
+define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
+ <2 x half> addrspace(1)* %src2,
+ half addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+ %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+ %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+
+ %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+ %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+
+ %mul2 = fmul half %src1.el2, %src2.el2
+ %mul1 = fmul half %src1.el1, %src2.el1
+ %acc = load half, half addrspace(1)* %dst, align 2
+ %acc1 = fadd half %mul2, %acc
+ %acc2 = fadd half %mul1, %acc1
+ store half %acc2, half addrspace(1)* %dst, align 2
+ ret void
+}
+
+
+; We only want to generate fdot2 if the vector elements of the dot product are
+; converted from f16 to f32, and the vectors are of type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_f16_f32
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_dot2_f32_f16
+
+; GFX906-CONTRACT: v_dot2_f32_f16
+
+; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
+ <2 x half> addrspace(1)* %src2,
+ float addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+ %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+ %csrc1.el1 = fpext half %src1.el1 to float
+ %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+ %csrc2.el1 = fpext half %src2.el1 to float
+
+ %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+ %csrc1.el2 = fpext half %src1.el2 to float
+ %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+ %csrc2.el2 = fpext half %src2.el2 to float
+
+ %mul2 = fmul float %csrc1.el2, %csrc2.el2
+ %mul1 = fmul float %csrc1.el1, %csrc2.el1
+ %acc = load float, float addrspace(1)* %dst, align 4
+ %acc1 = fadd float %mul2, %acc
+ %acc2 = fadd float %mul1, %acc1
+ store float %acc2, float addrspace(1)* %dst, align 4
+ ret void
+}
+
+; We only want to generate fdot2 if the vector elements of the dot product are
+; converted from f16 to f32, and the vectors are of type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_diffvecorder
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_dot2_f32_f16
+
+; GFX906-CONTRACT: v_dot2_f32_f16
+; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
+ <2 x half> addrspace(1)* %src2,
+ float addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+ %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+ %csrc1.el1 = fpext half %src1.el1 to float
+ %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+ %csrc2.el1 = fpext half %src2.el1 to float
+
+ %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+ %csrc1.el2 = fpext half %src1.el2 to float
+ %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+ %csrc2.el2 = fpext half %src2.el2 to float
+
+ %mul2 = fmul float %csrc2.el2, %csrc1.el2
+ %mul1 = fmul float %csrc1.el1, %csrc2.el1
+ %acc = load float, float addrspace(1)* %dst, align 4
+ %acc1 = fadd float %mul2, %acc
+ %acc2 = fadd float %mul1, %acc1
+ store float %acc2, float addrspace(1)* %dst, align 4
+ ret void
+}
+
+; Tests to make sure a dot product is not generated when the vectors are not of
+; type <2 x half>.
+; GCN-LABEL: {{^}}dotproduct_v4f16
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
+ <4 x half> addrspace(1)* %src2,
+ float addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
+ %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
+ %csrc1.el1 = fpext half %src1.el1 to float
+ %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
+ %csrc2.el1 = fpext half %src2.el1 to float
+
+ %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
+ %csrc1.el2 = fpext half %src1.el2 to float
+ %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
+ %csrc2.el2 = fpext half %src2.el2 to float
+
+ %mul2 = fmul float %csrc1.el2, %csrc2.el2
+ %mul1 = fmul float %csrc1.el1, %csrc2.el1
+ %acc = load float, float addrspace(1)* %dst, align 4
+ %acc1 = fadd float %mul2, %acc
+ %acc2 = fadd float %mul1, %acc1
+ store float %acc2, float addrspace(1)* %dst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}NotAdotproduct
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
+ <2 x half> addrspace(1)* %src2,
+ float addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+ %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+ %csrc1.el1 = fpext half %src1.el1 to float
+ %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+ %csrc2.el1 = fpext half %src2.el1 to float
+
+ %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+ %csrc1.el2 = fpext half %src1.el2 to float
+ %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+ %csrc2.el2 = fpext half %src2.el2 to float
+
+ %mul2 = fmul float %csrc1.el2, %csrc1.el1
+ %mul1 = fmul float %csrc2.el1, %csrc2.el2
+ %acc = load float, float addrspace(1)* %dst, align 4
+ %acc1 = fadd float %mul2, %acc
+ %acc2 = fadd float %mul1, %acc1
+ store float %acc2, float addrspace(1)* %dst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
+; GFX906-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
+ <2 x half> addrspace(1)* %src2,
+ float addrspace(1)* nocapture %dst) {
+entry:
+ %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
+ %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+
+ %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+ %csrc1.el1 = fpext half %src1.el1 to float
+ %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+ %csrc2.el1 = fpext half %src2.el1 to float
+
+ %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+ %csrc1.el2 = fpext half %src1.el2 to float
+ %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+ %csrc2.el2 = fpext half %src2.el2 to float
+
+ %mul2 = fmul float %csrc1.el2, %csrc2.el1
+ %mul1 = fmul float %csrc1.el1, %csrc2.el2
+ %acc = load float, float addrspace(1)* %dst, align 4
+ %acc1 = fadd float %mul2, %acc
+ %acc2 = fadd float %mul1, %acc1
+ store float %acc2, float addrspace(1)* %dst, align 4
+ ret void
+}
\ No newline at end of file