AMDGPU: Add clamp bit to dot intrinsics

author Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>

Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)

committer Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>

Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)
author Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)
committer Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

index 8555db0..9f36141 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1191,7 +1191,7 @@ def int_amdgcn_ds_bpermute :
  // Deep learning intrinsics.
  //===----------------------------------------------------------------------===//
  
-// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
+// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
  def int_amdgcn_fdot2 :
    GCCBuiltin<"__builtin_amdgcn_fdot2">,
@@ -1200,12 +1200,13 @@ def int_amdgcn_fdot2 :
      [
        llvm_v2f16_ty, // %a
        llvm_v2f16_ty, // %b
-      llvm_float_ty  // %c
+      llvm_float_ty, // %c
+      llvm_i1_ty     // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
  def int_amdgcn_sdot2 :
    GCCBuiltin<"__builtin_amdgcn_sdot2">,
@@ -1214,12 +1215,13 @@ def int_amdgcn_sdot2 :
      [
        llvm_v2i16_ty, // %a
        llvm_v2i16_ty, // %b
-      llvm_i32_ty    // %c
+      llvm_i32_ty,   // %c
+      llvm_i1_ty     // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
  def int_amdgcn_udot2 :
    GCCBuiltin<"__builtin_amdgcn_udot2">,
@@ -1228,12 +1230,13 @@ def int_amdgcn_udot2 :
      [
        llvm_v2i16_ty, // %a
        llvm_v2i16_ty, // %b
-      llvm_i32_ty    // %c
+      llvm_i32_ty,   // %c
+      llvm_i1_ty     // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
  def int_amdgcn_sdot4 :
    GCCBuiltin<"__builtin_amdgcn_sdot4">,
@@ -1242,12 +1245,13 @@ def int_amdgcn_sdot4 :
      [
        llvm_i32_ty, // %a
        llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
  def int_amdgcn_udot4 :
    GCCBuiltin<"__builtin_amdgcn_udot4">,
@@ -1256,12 +1260,13 @@ def int_amdgcn_udot4 :
      [
        llvm_i32_ty, // %a
        llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
+// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
  //        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
  def int_amdgcn_sdot8 :
@@ -1271,12 +1276,13 @@ def int_amdgcn_sdot8 :
      [
        llvm_i32_ty, // %a
        llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
  
-// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
+// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
  //   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
  //        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
  def int_amdgcn_udot8 :
@@ -1286,7 +1292,8 @@ def int_amdgcn_udot8 :
      [
        llvm_i32_ty, // %a
        llvm_i32_ty, // %b
-      llvm_i32_ty  // %c
+      llvm_i32_ty, // %c
+      llvm_i1_ty   // %clamp
      ],
      [IntrNoMem, IntrSpeculatable]
    >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

index 96b7568..7442a59 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
  def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
  
  def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
-                  SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
-                                       SDTCisFP<0>, SDTCisVec<1>]>,
+                  SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<0>, SDTCisVec<1>,
+                                       SDTCisInt<4>]>,
                    []>;
  
  def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index ee3c028..993c00d 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5010,7 +5010,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
    case Intrinsic::amdgcn_fdot2:
      return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       Op.getOperand(4));
    case Intrinsic::amdgcn_fmul_legacy:
      return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));
@@ -7613,8 +7614,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
        return SDValue();
  
      if ((Vec1 == Vec3 && Vec2 == Vec4) ||
-        (Vec1 == Vec4 && Vec2 == Vec3))
-      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+        (Vec1 == Vec4 && Vec2 == Vec3)) {
+      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
+                         DAG.getTargetConstant(0, SL, MVT::i1));
+    }
    }
    return SDValue();
  }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td

index 5c78ada..b51828b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
  
  let SubtargetPredicate = HasDLInsts in {
  
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
-def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
-def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
-def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
-def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+multiclass DotPats<SDPatternOperator dot_op,
+                   VOP3PInst dot_inst> {
+  def : GCNPat <
+    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
+            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
+            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
+    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
+}
+
+defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
+defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
+defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
+defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
+defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
+defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
+defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
  
  } // End SubtargetPredicate = HasDLInsts
  
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll

index 65ab3e0..7efb185 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906
  
-declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c)
+declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
  
-; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2
-; GFX906: v_dot2_f32_f16
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2(
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
      float addrspace(1)* %r,
      <2 x half> addrspace(1)* %a,
      <2 x half> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
    %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
    %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
    %c.val = load float, float addrspace(1)* %c
-  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val)
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
+    float addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    float addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load float, float addrspace(1)* %c
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0)
    store float %r.val, float addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll

index 0d8f28b..f1894cc 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2
-; GFX906: v_dot2_i32_i16
-define amdgpu_kernel void @test_llvm_amdgcn_sdot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp(
      i32 addrspace(1)* %r,
      <2 x i16> addrspace(1)* %a,
      <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
    %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
    %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp(
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll

index 8b664e6..2651200 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4
-; GFX906: v_dot4_i32_i8
-define amdgpu_kernel void @test_llvm_amdgcn_sdot4(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
      i32 addrspace(1)* %r,
      <4 x i8> addrspace(1)* %a,
      <4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
    %a.val.cast = bitcast <4 x i8> %a.val to i32
    %b.val.cast = bitcast <4 x i8> %b.val to i32
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
+    i32 addrspace(1)* %r,
+    <4 x i8> addrspace(1)* %a,
+    <4 x i8> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
+  %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+  %a.val.cast = bitcast <4 x i8> %a.val to i32
+  %b.val.cast = bitcast <4 x i8> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll

index e2466ea..456421c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8
-; GFX906: v_dot8_i32_i4
-define amdgpu_kernel void @test_llvm_amdgcn_sdot8(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
      i32 addrspace(1)* %r,
      <8 x i4> addrspace(1)* %a,
      <8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
    %a.val.cast = bitcast <8 x i4> %a.val to i32
    %b.val.cast = bitcast <8 x i4> %b.val to i32
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
+    i32 addrspace(1)* %r,
+    <8 x i4> addrspace(1)* %a,
+    <8 x i4> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
+  %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+  %a.val.cast = bitcast <8 x i4> %a.val to i32
+  %b.val.cast = bitcast <8 x i4> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll

index b2912cb..18ca71d 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2
-; GFX906: v_dot2_u32_u16
-define amdgpu_kernel void @test_llvm_amdgcn_udot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
      i32 addrspace(1)* %r,
      <2 x i16> addrspace(1)* %a,
      <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
    %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
    %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp(
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll

index 5ce060d..73d6a9c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4
-; GFX906: v_dot4_u32_u8
-define amdgpu_kernel void @test_llvm_amdgcn_udot4(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_clamp
+; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot4_clamp(
      i32 addrspace(1)* %r,
      <4 x i8> addrspace(1)* %a,
      <4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
    %a.val.cast = bitcast <4 x i8> %a.val to i32
    %b.val.cast = bitcast <4 x i8> %b.val to i32
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_no_clamp
+; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot4_no_clamp(
+    i32 addrspace(1)* %r,
+    <4 x i8> addrspace(1)* %a,
+    <4 x i8> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
+  %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+  %a.val.cast = bitcast <4 x i8> %a.val to i32
+  %b.val.cast = bitcast <4 x i8> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll

index 2599305..c2f80ca 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
@@ -1,10 +1,10 @@
  ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
  
-declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
  
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8
-; GFX906: v_dot8_u32_u4
-define amdgpu_kernel void @test_llvm_amdgcn_udot8(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_clamp
+; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot8_clamp(
      i32 addrspace(1)* %r,
      <8 x i4> addrspace(1)* %a,
      <8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
    %a.val.cast = bitcast <8 x i4> %a.val to i32
    %b.val.cast = bitcast <8 x i4> %b.val to i32
    %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_no_clamp
+; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot8_no_clamp(
+    i32 addrspace(1)* %r,
+    <8 x i4> addrspace(1)* %a,
+    <8 x i4> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
+  %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+  %a.val.cast = bitcast <8 x i4> %a.val to i32
+  %b.val.cast = bitcast <8 x i4> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
    store i32 %r.val, i32 addrspace(1)* %r
    ret void
  }
author	Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
	Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)
committer	Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
	Wed, 1 Aug 2018 01:31:30 +0000 (01:31 +0000)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/VOP3PInstructions.td		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll		patch \| blob \| history