AMDGPU: Pull fneg out of extract_vector_elt

author Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 915d1d9..f3bcfbb 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) {
    case AMDGPUISD::INTERP_P1:
    case AMDGPUISD::INTERP_P2:
    case AMDGPUISD::DIV_SCALE:
+
+  // TODO: Should really be looking at the users of the bitcast. These are
+  // problematic because bitcasts are used to legalize all stores to integer
+  // types.
+  case ISD::BITCAST:
      return false;
    default:
      return true;
    }
  }
  
-static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
+                                                 unsigned CostThreshold) {
    // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
    // it is truly free to use a source modifier in all cases. If there are
    // multiple users but for each one will necessitate using VOP3, there will be
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

index e1a5a20..4c588a7 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -132,6 +132,8 @@ public:
      return false;
    }
  
+  static bool allUsesHaveSourceMods(const SDNode *N,
+                                    unsigned CostThreshold = 4);
    bool isFAbsFree(EVT VT) const override;
    bool isFNegFree(EVT VT) const override;
    bool isTruncateFree(EVT Src, EVT Dest) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index cc93c27..f815337 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
    setTargetDAGCombine(ISD::FCANONICALIZE);
    setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  
    // All memory operations. Some folding on the pointer operand is done to help
    // matching the constant offsets in the addressing modes.
@@ -4604,6 +4605,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
    return SDValue();
  }
  
+SDValue SITargetLowering::performExtractVectorEltCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDValue Vec = N->getOperand(0);
+
+  SelectionDAG &DAG= DCI.DAG;
+  if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+    SDLoc SL(N);
+    EVT EltVT = N->getValueType(0);
+    SDValue Idx = N->getOperand(1);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Vec.getOperand(0), Idx);
+    return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+  }
+
+  return SDValue();
+}
+
+
  unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                            const SDNode *N0,
                                            const SDNode *N1) const {
@@ -4891,6 +4910,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
  
      break;
    }
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performExtractVectorEltCombine(N, DCI);
    }
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h

index d177777..046e677 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
    SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  
    unsigned getFusedOpcode(const SelectionDAG &DAG,
                            const SDNode *N0, const SDNode *N1) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll

index 1c0e9a2..66bf9d0 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
  ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
  ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
-; GCN: buffer_store_dword [[MUL]]
+; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
+; GCN-NEXT: buffer_store_dword [[ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
  define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll

index 626a0b5..ed36666 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
  ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
  
  ; FIXME: Should be able to do scalar op
  ; GCN-LABEL: {{^}}s_fneg_f16:
@@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
    ret void
  }
  
+; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
+; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+
+; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
+; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
+; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+  %val = load <2 x half>, <2 x half> addrspace(1)* %in
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+  %elt0 = extractelement <2 x half> %fneg, i32 0
+  %elt1 = extractelement <2 x half> %fneg, i32 1
+
+  %fmul0 = fmul half %elt0, 4.0
+  %fadd1 = fadd half %elt1, 2.0
+  store volatile half %fmul0, half addrspace(1)* undef
+  store volatile half %fadd1, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
+; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
+define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+  %val = load <2 x half>, <2 x half> addrspace(1)* %in
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+  %elt0 = extractelement <2 x half> %fneg, i32 0
+  %elt1 = extractelement <2 x half> %fneg, i32 1
+  store volatile half %elt0, half addrspace(1)* undef
+  store volatile half %elt1, half addrspace(1)* undef
+  ret void
+}
+
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  
  attributes #0 = { nounwind }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 11 May 2017 17:26:25 +0000 (17:26 +0000)
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.h		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg-combines.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg.f16.ll		patch \| blob \| history