namespace {
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
static bool isNullConstantOrUndef(SDValue V) {
if (V.isUndef())
return true;
bool EnableLateStructurizeCFG;
+ // Instructions that will be lowered with a final instruction that zeros the
+ // high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+ // XXX - only need to list legal operations.
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCANONICALIZE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+ case ISD::FP_ROUND:
+ // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+ // high bits on gfx9.
+ // TODO: If we had the source node we could see if the source was fma/mad
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case ISD::FMA:
+ case ISD::FMAD:
+ case AMDGPUISD::DIV_FIXUP:
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX906 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}shl_i16:
%zext = zext i16 %res to i32
ret i32 %zext
}
+
+; GCN-LABEL: {{^}}zext_fadd_f16:
+; GFX8: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; GFX10: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[ADD]]
+define i32 @zext_fadd_f16(half %x, half %y) {
+ %add = fadd half %x, %y
+ %cast = bitcast half %add to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fma_f16:
+; GFX8: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+
+; GFX10: v_fmac_f16_e32 [[FMA:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+define i32 @zext_fma_f16(half %x, half %y, half %z) {
+ %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
+ %cast = bitcast half %fma to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_div_fixup_f16:
+; GFX8: v_div_fixup_f16 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_div_fixup_f16 v0, v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_div_fixup_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
+ %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z)
+ %cast = bitcast half %div.fixup to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; We technically could eliminate the and on gfx9 here but we don't try
+; to inspect the source of the fptrunc. We're only worried about cases
+; that lower to v_fma_mix* instructions.
+
+; GCN-LABEL: {{^}}zext_fptrunc_f16:
+; GFX8: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_f16(float %x) {
+ %fptrunc = fptrunc float %x to half
+ %cast = bitcast half %fptrunc to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fptrunc_fma_f16:
+; GFX900: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
+ %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
+ %fptrunc = fptrunc float %fma to half
+ %cast = bitcast half %fptrunc to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
+declare half @llvm.fma.f16(half, half, half)
+declare float @llvm.fma.f32(float, float, float)