[AMDGPU] Fix infinite loop with fma combines

author Austin Kerbow <Austin.Kerbow@amd.com>

Tue, 4 Feb 2020 01:08:26 +0000 (17:08 -0800)

committer Austin Kerbow <Austin.Kerbow@amd.com>

Tue, 4 Feb 2020 21:11:09 +0000 (13:11 -0800)
author Austin Kerbow <Austin.Kerbow@amd.com>
Tue, 4 Feb 2020 01:08:26 +0000 (17:08 -0800)
committer Austin Kerbow <Austin.Kerbow@amd.com>
Tue, 4 Feb 2020 21:11:09 +0000 (13:11 -0800)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 5ba95fa..a223c2d 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -734,6 +734,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
    }
  }
  
+char AMDGPUTargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
+                                              bool LegalOperations,
+                                              bool ForCodeSize,
+                                              unsigned Depth) const {
+  switch (Op.getOpcode()) { // Only FMA/FMAD are special-cased here.
+    case ISD::FMA:
+    case ISD::FMAD: {
+      // Negating an FMA/FMAD is not free if any user lacks source modifiers.
+      if (!allUsesHaveSourceMods(Op.getNode()))
+        return 0; // Report "not free" to the caller.
+      break; // Otherwise defer to the generic query below.
+    }
+    default:
+      break; // No target-specific restriction for other opcodes.
+  }
+
+  return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
+                                            ForCodeSize, Depth);
+}
+
  //===---------------------------------------------------------------------===//
  // Target Properties
  //===---------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

index 7b269e8..3847be2 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,6 +172,9 @@ public:
    bool isZExtFree(EVT Src, EVT Dest) const override;
    bool isZExtFree(SDValue Val, EVT VT2) const override;
  
+  char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
+                          bool ForCodeSize, unsigned Depth) const override;
+
    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
  
    MVT getVectorIdxTy(const DataLayout &) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll

index 3b3c87f..a962a3b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -10,6 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
  declare double @llvm.fabs.f64(double) #0
  declare double @llvm.fma.f64(double, double, double) #0
  declare float @llvm.fma.f32(float, float, float) #0
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
  
  ; (fadd (fmul x, y), z) -> (fma x, y, z)
  ; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
@@ -628,12 +629,12 @@ define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
  }
  
  ; Make sure negative constant cancels out fneg
-; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; GCN-NOT: [[A]]
-; GCN-NOT: [[B]]
-; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
+; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
+; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI-NOT: [[A]]
+; SI-NOT: [[B]]
+; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
  define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -650,12 +651,12 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
    ret void
  }
  
-; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; GCN-NOT: [[A]]
-; GCN-NOT: [[B]]
-; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
+; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
+; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI-NOT: [[A]]
+; SI-NOT: [[B]]
+; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
  define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -672,6 +673,30 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float a
    ret void
  }
  
+; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
+  %gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+
+  %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
+  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
+  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2
+
+  %fneg0 = fneg fast <4 x float> %tmp0 ; -tmp0 is the second fma operand
+  %fneg1 = fneg fast <4 x float> %tmp1 ; -tmp1 is the fma addend
+  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
+
+  store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out ; result to out[tid]
+  ret void
+}
+
  attributes #0 = { nounwind readnone }
  attributes #1 = { nounwind }
-
+attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll

index 133afd4..f0aeb22 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1399,6 +1399,28 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
    ret void
  }
  
+; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:
+
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64 ; widen the work-item id for the 64-bit GEP indices
+  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
+  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
+  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
+  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %fneg = fneg <4 x float> %fma ; negate the whole fmuladd result
+  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep ; result to out[tid]
+  ret void
+}
+
  ; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
  ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
@@ -2520,6 +2542,7 @@ define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  declare float @llvm.fma.f32(float, float, float) #1
  declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
  declare float @llvm.sin.f32(float) #1
  declare float @llvm.trunc.f32(float) #1
  declare float @llvm.round.f32(float) #1
author	Austin Kerbow <Austin.Kerbow@amd.com>
	Tue, 4 Feb 2020 01:08:26 +0000 (17:08 -0800)
committer	Austin Kerbow <Austin.Kerbow@amd.com>
	Tue, 4 Feb 2020 21:11:09 +0000 (13:11 -0800)
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fma-combine.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg-combines.ll		patch \| blob \| history