      // TODO: The FMA node should have flags that propagate to this node.
      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
    }
+
+    // fma (fneg x), K, y -> fma x, -K, y
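+    // Negating K must not pessimize the constant: fold only if ConstantFP
+    // is legal for this type, or if K is not a legal FP immediate anyway
+    // and this fma is its only user.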
+    if (N0.getOpcode() == ISD::FNEG &&
+        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
+      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
+                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
+    }
  }

  if (Options.UnsafeFPMath) {
ret void
}
+; Make sure the negative constant cancels out the fneg.
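+; fma(-2.0, fneg(a), b) --> fma(a, 2.0, b), so the fma should use the loaded
+; value directly with a 2.0 inline immediate and no modifiers.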
+; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN-NOT: [[B]]
+; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
+define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
+
+ %r1.fneg = fsub float -0.000000e+00, %r1
+
+ %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
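+; Same fold with a positive constant: the fneg is absorbed by negating the
+; immediate, fma(2.0, fneg(a), b) --> fma(a, -2.0, b).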
+; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN-NOT: [[B]]
+; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
+define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
+
+ %r1.fneg = fsub float -0.000000e+00, %r1
+
+ %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
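+; The two negations cancel: -2.0 * -a == 2.0 * a, so no source modifiers.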
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
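+; The fneg moves into the immediate: 2.0 * -a == a * -2.0.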
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
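+; As in the f16 case, the negations cancel into a plain 2.0 immediate.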
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-FLUSH: buffer_store_dword [[R2]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
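+; As in the f16 case, the fneg becomes a -2.0 immediate.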
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]