From b08d2ddd69b4a2209930b31fe456b4d7c1ce148f Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Mon, 2 Mar 2020 09:06:00 +0000
Subject: [PATCH] [ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family.

Summary:
These instructions make a vector of `<4 x float>` by widening every
other lane of a vector of `<8 x half>`.

I wondered about representing these using standard IR, along the lines
of a shufflevector to extract elements of the input into a `<4 x half>`
followed by an `fpext` to turn that into `<4 x float>`. But it looks as
if that would take a lot of work in isel lowering to make it match any
pattern I could sensibly write in Tablegen, and also I haven't been
able to think of any other case where that pattern might be generated
in IR, so there wouldn't be any extra code generation win from doing
it that way.

Therefore, I've just used another target-specific intrinsic. We can
always change it to the other way later if anyone thinks of a good
reason.

(In order to put the intrinsic definition near similar things in
`IntrinsicsARM.td`, I've also lifted the definition of the
`MVEMXPredicated` multiclass higher up the file, without changing it.)

Reviewers: MarkMurrayARM, dmgreen, miyuki, ostannard

Reviewed By: miyuki

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D75254
---
 clang/include/clang/Basic/arm_mve.td            |  9 ++++
 clang/test/CodeGen/arm-mve-intrinsics/vcvt.c    | 68 +++++++++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsARM.td           | 26 +++++-----
 llvm/lib/Target/ARM/ARMInstrMVE.td              | 11 ++++
 llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll | 50 ++++++++++++++++++
 5 files changed, 152 insertions(+), 12 deletions(-)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index c64a75f..dfdb101 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -453,6 +453,15 @@ foreach half = [ "b", "t" ] in {
       VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),
       (IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>;
   } // params = [f32], pnt = PNT_None
+
+  let params = [f16], pnt = PNT_None in {
+    def vcvt#half#q_f32: Intrinsic<VecOf<f32>, (args Vector:$a),
+      (IRInt<"vcvt_widen"> $a, halfconst)>;
+    defm vcvt#half#q: IntrinsicMX<
+      VecOf<f32>, (args Vector:$a, PredOf<f32>:$pred),
+      (IRInt<"vcvt_widen_predicated"> $inactive, $a, halfconst, $pred),
+      1, "_f32">;
+  } // params = [f16], pnt = PNT_None
 } // loop over half = "b", "t"
 
 multiclass float_int_conversions<Type FScalar, Type IScalar, IRBuilderBase ftoi, IRBuilderBase itof> {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
index 0391b77..d03ac31 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
@@ -697,3 +697,71 @@ uint32x4_t test_vcvtq_x_n_u32_f32(float32x4_t a, mve_pred16_t p)
 {
     return vcvtq_x_n_u32_f32(a, 32, p);
 }
+
+// CHECK-LABEL: @test_vcvtbq_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 0)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vcvtbq_f32_f16(float16x8_t a)
+{
+    return vcvtbq_f32_f16(a);
+}
+
+// CHECK-LABEL: @test_vcvttq_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 1)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vcvttq_f32_f16(float16x8_t a)
+{
+    return vcvttq_f32_f16(a);
+}
+
+// CHECK-LABEL: @test_vcvtbq_m_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvtbq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
+{
+    return vcvtbq_m_f32_f16(inactive, a, p);
+}
+
+// CHECK-LABEL: @test_vcvttq_m_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
+{
+    return vcvttq_m_f32_f16(inactive, a, p);
+}
+
+// CHECK-LABEL: @test_vcvtbq_x_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvtbq_x_f32_f16(float16x8_t a, mve_pred16_t p)
+{
+    return vcvtbq_x_f32_f16(a, p);
+}
+
+// CHECK-LABEL: @test_vcvttq_x_f32_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p)
+{
+    return vcvttq_x_f32_f16(a, p);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 743d935..64c1faa 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -911,8 +911,22 @@ multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
           LLVMMatchType<0>, rets[0])], props>;
 }
 
+// Intrinsic with a predicated and a non-predicated case. The predicated case
+// has two additional parameters: inactive (the value for inactive lanes, can
+// be undef) and predicate.
+multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
+                           list<LLVMType> params, LLVMType inactive,
+                           LLVMType predicate,
+                           list<IntrinsicProperty> props = [IntrNoMem]> {
+  def "":          Intrinsic<rets, flags # params, props>;
+  def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
+                             props>;
+}
+
 defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
    [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>;
+defm int_arm_mve_vcvt_widen: MVEMXPredicated<[llvm_v4f32_ty], [],
+   [llvm_v8f16_ty, llvm_i32_ty], llvm_v4f32_ty, llvm_v4i1_ty>;
 
 defm int_arm_mve_vldr_gather_base: MVEPredicated<
    [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
@@ -1044,18 +1058,6 @@ def int_arm_mve_vmull_poly: Intrinsic<
    [llvm_anyvector_ty],
    [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
 
-// Intrinsic with a predicated and a non-predicated case. The predicated case
-// has two additional parameters: inactive (the value for inactive lanes, can
-// be undef) and predicate.
-multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
-                           list<LLVMType> params, LLVMType inactive,
-                           LLVMType predicate,
-                           list<IntrinsicProperty> props = [IntrNoMem]> {
-  def "":          Intrinsic<rets, flags # params, props>;
-  def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
-                             props>;
-}
-
 // The first two parameters are compile-time constants:
 // * Halving: 0 means  halving (vhcaddq), 1 means non-halving (vcaddq) 
 //            instruction. Note: the flag is inverted to match the corresonding
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 3942602..b4b3c68 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4515,6 +4515,17 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
 
 multiclass MVE_VCVT_h2f_m<string iname, int half> {
   def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
+  defvar Inst = !cast<Instruction>(NAME);
+
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
+              (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+    def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
+                         (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
+                         (v4i1 VCCR:$mask))),
+              (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
+                           (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
+  }
 }
 
 defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
index 901b88f..e74b7cd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll
@@ -6,6 +6,8 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
 
 declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
 declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half>, i32)
+declare <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float>, <8 x half>, i32, <4 x i1>)
 
 declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32)
 declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32)
@@ -367,3 +369,51 @@ entry:
   %2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1)
   ret <4 x i32> %2
 }
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_f32_f16(<8 x half> %a) {
+; CHECK-LABEL: test_vcvtbq_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 0)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvttq_f32_f16(<8 x half> %a) {
+; CHECK-LABEL: test_vcvttq_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 1)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvtbq_m_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvtbt.f32.f16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 0, <4 x i1> %1)
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcvttq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vcvttq_m_f32_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcvttt.f32.f16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 1, <4 x i1> %1)
+  ret <4 x float> %2
+}
-- 
2.7.4