[ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family.

author Simon Tatham <simon.tatham@arm.com>

Mon, 2 Mar 2020 09:06:00 +0000 (09:06 +0000)

committer Simon Tatham <simon.tatham@arm.com>

Mon, 2 Mar 2020 10:33:30 +0000 (10:33 +0000)
author Simon Tatham <simon.tatham@arm.com>
Mon, 2 Mar 2020 09:06:00 +0000 (09:06 +0000)
committer Simon Tatham <simon.tatham@arm.com>
Mon, 2 Mar 2020 10:33:30 +0000 (10:33 +0000)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td

index c64a75f..dfdb101 100644 (file)
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -453,6 +453,15 @@ foreach half = [ "b", "t" ] in {
        VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),
        (IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>;
    } // params = [f32], pnt = PNT_None
+
+  let params = [f16], pnt = PNT_None in {
+    def vcvt#half#q_f32: Intrinsic<VecOf<f32>, (args Vector:$a),
+      (IRInt<"vcvt_widen"> $a, halfconst)>;
+    defm vcvt#half#q: IntrinsicMX<
+      VecOf<f32>, (args Vector:$a, PredOf<f32>:$pred),
+      (IRInt<"vcvt_widen_predicated"> $inactive, $a, halfconst, $pred),
+      1, "_f32">;
+  } // params = [f16], pnt = PNT_None
  } // loop over half = "b", "t"
  
  multiclass float_int_conversions<Type FScalar, Type IScalar, IRBuilderBase ftoi, IRBuilderBase itof> {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c

index 0391b77..d03ac31 100644 (file)
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
@@ -697,3 +697,71 @@ uint32x4_t test_vcvtq_x_n_u32_f32(float32x4_t a, mve_pred16_t p)
  {
      return vcvtq_x_n_u32_f32(a, 32, p);
  }
+
+// CHECK-LABEL: @test_vcvtbq_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 0)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vcvtbq_f32_f16(float16x8_t a)
+{
+    return vcvtbq_f32_f16(a);
+}
+
+// CHECK-LABEL: @test_vcvttq_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 1)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vcvttq_f32_f16(float16x8_t a)
+{
+    return vcvttq_f32_f16(a);
+}
+
+// CHECK-LABEL: @test_vcvtbq_m_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvtbq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
+{
+    return vcvtbq_m_f32_f16(inactive, a, p);
+}
+
+// CHECK-LABEL: @test_vcvttq_m_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
+{
+    return vcvttq_m_f32_f16(inactive, a, p);
+}
+
+// CHECK-LABEL: @test_vcvtbq_x_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvtbq_x_f32_f16(float16x8_t a, mve_pred16_t p)
+{
+    return vcvtbq_x_f32_f16(a, p);
+}
+
+// CHECK-LABEL: @test_vcvttq_x_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p)
+{
+    return vcvttq_x_f32_f16(a, p);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td

index 743d935..64c1faa 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -911,8 +911,22 @@ multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
            LLVMMatchType<0>, rets[0])], props>;
  }
  
+// Intrinsic with a predicated and a non-predicated case. The predicated case
+// has two additional parameters: inactive (the value for inactive lanes, can
+// be undef) and predicate.
+multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
+                           list<LLVMType> params, LLVMType inactive,
+                           LLVMType predicate,
+                           list<IntrinsicProperty> props = [IntrNoMem]> {
+  def "":          Intrinsic<rets, flags # params, props>;
+  def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
+                             props>;
+}
+
  defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
     [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>;
+defm int_arm_mve_vcvt_widen: MVEMXPredicated<[llvm_v4f32_ty], [],
+   [llvm_v8f16_ty, llvm_i32_ty], llvm_v4f32_ty, llvm_v4i1_ty>;
  
  defm int_arm_mve_vldr_gather_base: MVEPredicated<
     [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
@@ -1044,18 +1058,6 @@ def int_arm_mve_vmull_poly: Intrinsic<
     [llvm_anyvector_ty],
     [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
  
-// Intrinsic with a predicated and a non-predicated case. The predicated case
-// has two additional parameters: inactive (the value for inactive lanes, can
-// be undef) and predicate.
-multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
-                           list<LLVMType> params, LLVMType inactive,
-                           LLVMType predicate,
-                           list<IntrinsicProperty> props = [IntrNoMem]> {
-  def "":          Intrinsic<rets, flags # params, props>;
-  def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
-                             props>;
-}
-
  // The first two parameters are compile-time constants:
  // * Halving: 0 means  halving (vhcaddq), 1 means non-halving (vcaddq) 
  //            instruction. Note: the flag is inverted to match the corresponding
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td

index 3942602..b4b3c68 100644 (file)
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4515,6 +4515,17 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
  
  multiclass MVE_VCVT_h2f_m<string iname, int half> {
    def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
+  defvar Inst = !cast<Instruction>(NAME);
+
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
+              (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+    def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
+                         (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
+                         (v4i1 VCCR:$mask))),
+              (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
+                           (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
+  }
  }
  
  defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll

index 901b88f..e74b7cd 100644 (file)
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
@@ -6,6 +6,8 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
  
  declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
  declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half>, i32)
+declare <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float>, <8 x half>, i32, <4 x i1>)
  
  declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32)
  declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32)
@@ -367,3 +369,51 @@ entry:
    %2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1)
    ret <4 x i32> %2
  }
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_f32_f16(<8 x half> %a) {
+; CHECK-LABEL: test_vcvtbq_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 0)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvttq_f32_f16(<8 x half> %a) {
+; CHECK-LABEL: test_vcvttq_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 1)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvtbq_m_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvtbt.f32.f16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 0, <4 x i1> %1)
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvttq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvttq_m_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvttt.f32.f16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 1, <4 x i1> %1)
+  ret <4 x float> %2
+}
author	Simon Tatham <simon.tatham@arm.com>
	Mon, 2 Mar 2020 09:06:00 +0000 (09:06 +0000)
committer	Simon Tatham <simon.tatham@arm.com>
	Mon, 2 Mar 2020 10:33:30 +0000 (10:33 +0000)
clang/include/clang/Basic/arm_mve.td		patch \| blob \| history
clang/test/CodeGen/arm-mve-intrinsics/vcvt.c		patch \| blob \| history
llvm/include/llvm/IR/IntrinsicsARM.td		patch \| blob \| history
llvm/lib/Target/ARM/ARMInstrMVE.td		patch \| blob \| history
llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll		patch \| blob \| history