AMDGPU: Use splat vectors for undefs when folding canonicalize

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 226cfeb..ba8a3a5 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6989,27 +6989,42 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
  
    // TODO: This could be better with wider vectors that will be split to v2f16,
    // and to consider uses since there aren't that many packed operations.
-  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+      isTypeLegal(MVT::v2f16)) {
      SDLoc SL(N);
      SDValue NewElts[2];
      SDValue Lo = N0.getOperand(0);
      SDValue Hi = N0.getOperand(1);
+    EVT EltVT = Lo.getValueType();
+
      if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
        for (unsigned I = 0; I != 2; ++I) {
          SDValue Op = N0.getOperand(I);
-        EVT EltVT = Op.getValueType();
          if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
            NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                                CFP->getValueAPF());
          } else if (Op.isUndef()) {
-          // This would ordinarily be folded to a qNaN. Since this may be half
-          // of a packed operation, it may be cheaper to use a 0.
-          NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
+          // Handled below based on what the other operand is.
+          NewElts[I] = Op;
          } else {
            NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
          }
        }
  
+      // If one half is undef, and one is constant, prefer a splat vector rather
+      // than the normal qNaN. If it's a register, prefer 0.0 since that's
+      // cheaper to use and may be free with a packed operation.
+      if (NewElts[0].isUndef()) {
+        if (isa<ConstantFPSDNode>(NewElts[1]))
+          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+            NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
+      if (NewElts[1].isUndef()) {
+        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
        return DAG.getBuildVector(VT, SL, NewElts);
      }
    }
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll

index baef270..3c70691 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -688,6 +688,38 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out,
    ret void
  }
  
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
  ; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
  ; GCN: v_add_f32_e32 [[A:v[0-9]+]]
  ; GCN: v_add_f32_e32 [[B:v[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

index 4689bc4..636028b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -565,20 +565,71 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
  }
  
  ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_setpc_b64
+; GFX89: s_waitcnt
+; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: s_setpc_b64
  define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
    %vec = insertelement <2 x half> undef, half %val, i32 1
    %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
    ret <2 x half> %canonicalized
  }
  
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 1.0
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 1.0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
  ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
  ; GFX9: s_waitcnt
  ; GFX9-DAG: v_max_f16_e32 v0, v0, v0
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sun, 12 Aug 2018 08:42:54 +0000 (08:42 +0000)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/clamp.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll		patch \| blob \| history