AMDGPU: Fix high 16-bit optimization on gfx9

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 15 Jun 2021 21:56:54 +0000 (17:56 -0400)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 22 Jun 2021 17:16:45 +0000 (13:16 -0400)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 15 Jun 2021 21:56:54 +0000 (17:56 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 22 Jun 2021 17:16:45 +0000 (13:16 -0400)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 3eae317..a3106de 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -44,63 +44,6 @@ class R600InstrInfo;
  
  namespace {
  
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FADD:
-  case ISD::FSUB:
-  case ISD::FMUL:
-  case ISD::FDIV:
-  case ISD::FREM:
-  case ISD::FMA:
-  case ISD::FMAD:
-  case ISD::FCANONICALIZE:
-  case ISD::FP_ROUND:
-  case ISD::UINT_TO_FP:
-  case ISD::SINT_TO_FP:
-  case ISD::FABS:
-    // Fabs is lowered to a bit operation, but it's an and which will clear the
-    // high bits anyway.
-  case ISD::FSQRT:
-  case ISD::FSIN:
-  case ISD::FCOS:
-  case ISD::FPOWI:
-  case ISD::FPOW:
-  case ISD::FLOG:
-  case ISD::FLOG2:
-  case ISD::FLOG10:
-  case ISD::FEXP:
-  case ISD::FEXP2:
-  case ISD::FCEIL:
-  case ISD::FTRUNC:
-  case ISD::FRINT:
-  case ISD::FNEARBYINT:
-  case ISD::FROUND:
-  case ISD::FFLOOR:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
-  case AMDGPUISD::FRACT:
-  case AMDGPUISD::CLAMP:
-  case AMDGPUISD::COS_HW:
-  case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
-  case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMAD_FTZ:
-  case AMDGPUISD::RCP:
-  case AMDGPUISD::RSQ:
-  case AMDGPUISD::RCP_IFLAG:
-  case AMDGPUISD::LDEXP:
-    return true;
-  default:
-    // fcopysign, select and others may be lowered to 32-bit bit operations
-    // which don't zero the high bits.
-    return false;
-  }
-}
-
  static bool isNullConstantOrUndef(SDValue V) {
    if (V.isUndef())
      return true;
@@ -164,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  
    bool EnableLateStructurizeCFG;
  
+  // Instructions that will be lowered with a final instruction that zeros the
+  // high result bits.
+  bool fp16SrcZerosHighBits(unsigned Opc) const;
+
  public:
    explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                                CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -459,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
    return SelectionDAGISel::runOnMachineFunction(MF);
  }
  
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+  // XXX - only need to list legal operations.
+  switch (Opc) {
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FCANONICALIZE:
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+  case ISD::FABS:
+    // Fabs is lowered to a bit operation, but it's an and which will clear the
+    // high bits anyway.
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::CLAMP:
+  case AMDGPUISD::COS_HW:
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMAD_FTZ:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_IFLAG:
+  case AMDGPUISD::LDEXP:
+    // On gfx10, all 16-bit instructions preserve the high bits.
+    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+  case ISD::FP_ROUND:
+    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+    // high bits on gfx9.
+    // TODO: If we had the source node we could see if the source was fma/mad
+    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  case ISD::FMA:
+  case ISD::FMAD:
+  case AMDGPUISD::DIV_FIXUP:
+    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  default:
+    // fcopysign, select and others may be lowered to 32-bit bit operations
+    // which don't zero the high bits.
+    return false;
+  }
+}
+
  bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
    assert(Subtarget->d16PreservesUnusedBits());
    MVT VT = N->getValueType(0).getSimpleVT();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 8e24ce0..9778a6f 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1995,10 +1995,16 @@ def : GCNPat <
  
  // Eliminate a zero extension from an fp16 operation if it already
  // zeros the high bits of the 32-bit register.
+//
+// This is complicated on gfx9+. Some instructions maintain the legacy
+// zeroing behavior, but others preserve the high bits. Some have a
+// control bit to change the behavior. We can't simply say with
+// certainty what the source behavior is without more context on how
+// the src is lowered. e.g. fptrunc + fma may be lowered to a
+// v_fma_mix* instruction which does not zero, or may not.
  def : GCNPat<
    (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
-  (COPY VSrc_b16:$src)
->;
+  (COPY VSrc_b16:$src)>;
  
  def : GCNPat <
    (i32 (trunc i64:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

index be53ffe..abdfd2c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -140,7 +140,8 @@ entry:
  ; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
  ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
  ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
  ; GCN: buffer_store_dword v[[R_F16]]
  define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
      i32 addrspace(1)* %r,
@@ -157,7 +158,8 @@ entry:
  ; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
  ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
  ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
  ; GCN: buffer_store_dword v[[R_F16]]
  define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
      i32 addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll

index 6fc0ce9..ed2202c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX906 %s
  ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
  
  ; GCN-LABEL: {{^}}shl_i16:
@@ -188,3 +190,93 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
    %zext = zext i16 %res to i32
    ret i32 %zext
  }
+
+; GCN-LABEL: {{^}}zext_fadd_f16:
+; GFX8: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; GFX10: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[ADD]]
+define i32 @zext_fadd_f16(half %x, half %y) {
+  %add = fadd half %x, %y
+  %cast = bitcast half %add to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fma_f16:
+; GFX8: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+
+; GFX10: v_fmac_f16_e32 [[FMA:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+define i32 @zext_fma_f16(half %x, half %y, half %z) {
+  %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
+  %cast = bitcast half %fma to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_div_fixup_f16:
+; GFX8: v_div_fixup_f16 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_div_fixup_f16 v0, v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_div_fixup_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
+  %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z)
+  %cast = bitcast half %div.fixup to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; We technically could eliminate the and on gfx9 here but we don't try
+; to inspect the source of the fptrunc. We're only worried about cases
+; that lower to v_fma_mix* instructions.
+
+; GCN-LABEL: {{^}}zext_fptrunc_f16:
+; GFX8: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_f16(float %x) {
+  %fptrunc = fptrunc float %x to half
+  %cast = bitcast half %fptrunc to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fptrunc_fma_f16:
+; GFX900: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
+  %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
+  %fptrunc = fptrunc float %fma to half
+  %cast = bitcast half %fptrunc to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
+declare half @llvm.fma.f16(half, half, half)
+declare float @llvm.fma.f32(float, float, float)
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 15 Jun 2021 21:56:54 +0000 (17:56 -0400)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 22 Jun 2021 17:16:45 +0000 (13:16 -0400)
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/preserve-hi16.ll		patch \| blob \| history