From d62500dca72c06b962a811016fe70d64dbb47879 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto@arm.com>
Date: Mon, 23 Jan 2023 17:15:34 +0000
Subject: [PATCH] [AArch64][SME2] Add Multi-vector saturating extract narrow
 intrinsics

Add the following intrinsic:
  SQCVT
  SQCVTU
  UQCVT

NOTE: These intrinsics are still in development and are subject to future changes.

Reviewed By: kmclaughlin

Differential Revision: https://reviews.llvm.org/D142035
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td         |  15 +++
 llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td    |  18 +--
 llvm/lib/Target/AArch64/SMEInstrFormats.td        |  13 +-
 llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll | 141 ++++++++++++++++++++++
 4 files changed, 175 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6e0459b..ca14f62 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2780,6 +2780,11 @@ let TargetPrefix = "aarch64" in {
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
 
+  class SME2_CVT_VG4_SINGLE_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
+                            [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [IntrNoMem]>;
+
   class SME2_CVT_FtoI_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
@@ -2874,4 +2879,14 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
   def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
   def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
+
+  //
+  // Multi-vector saturating extract narrow
+  //
+  def int_aarch64_sve_sqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 6572284..ba9b4b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -368,15 +368,15 @@ defm FCVTN_Z2Z_StoH  : sme2_cvt_vg2_single<"fcvtn",  0b0001, nxv8f16, nxv4f32, i
 defm BFCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"bfcvt",  0b1000, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvt_x2>;
 defm BFCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvtn", 0b1001, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvtn_x2>;
 
-defm SQCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"sqcvt",  0b0110, nxv8i16, nxv4i32, null_frag>;
-defm UQCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"uqcvt",  0b0111, nxv8i16, nxv4i32, null_frag>;
-defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b1110, nxv8i16, nxv4i32, null_frag>;
-defm SQCVT_Z4Z      : sme2_int_cvt_vg4_single<"sqcvt", 0b000>;
-defm UQCVT_Z4Z      : sme2_int_cvt_vg4_single<"uqcvt", 0b001>;
-defm SQCVTU_Z4Z     : sme2_int_cvt_vg4_single<"sqcvtu", 0b100>;
-defm SQCVTN_Z4Z     : sme2_int_cvt_vg4_single<"sqcvtn", 0b010>;
-defm SQCVTUN_Z4Z    : sme2_int_cvt_vg4_single<"sqcvtun", 0b110>;
-defm UQCVTN_Z4Z     : sme2_int_cvt_vg4_single<"uqcvtn", 0b011>;
+defm SQCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"sqcvt",  0b0110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvt_x2>;
+defm UQCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"uqcvt",  0b0111, nxv8i16, nxv4i32, int_aarch64_sve_uqcvt_x2>;
+defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b1110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvtu_x2>;
+defm SQCVT_Z4Z      : sme2_int_cvt_vg4_single<"sqcvt", 0b000, int_aarch64_sve_sqcvt_x4>;
+defm UQCVT_Z4Z      : sme2_int_cvt_vg4_single<"uqcvt", 0b001, int_aarch64_sve_uqcvt_x4>;
+defm SQCVTU_Z4Z     : sme2_int_cvt_vg4_single<"sqcvtu", 0b100, int_aarch64_sve_sqcvtu_x4>;
+defm SQCVTN_Z4Z     : sme2_int_cvt_vg4_single<"sqcvtn", 0b010, null_frag>;
+defm SQCVTUN_Z4Z    : sme2_int_cvt_vg4_single<"sqcvtun", 0b110, null_frag>;
+defm UQCVTN_Z4Z     : sme2_int_cvt_vg4_single<"uqcvtn", 0b011, null_frag>;
 
 defm FCVTZS_2Z2Z_StoS : sme2_fp_cvt_vg2_multi<"fcvtzs", 0b00010>;
 defm FCVTZS_4Z4Z_StoS : sme2_fp_cvt_vg4_multi<"fcvtzs", 0b0001000>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c1decff..33ff5f0 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -137,6 +137,10 @@ class SME2_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_v
     : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
                   (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
 
+class SME2_Cvt_VG4_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt>
+    : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4)),
+                  (!cast<Instruction>(name) (REG_SEQUENCE ZPR4Mul4, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1, in_vt:$Zn3, zsub2, in_vt:$Zn4, zsub3))>;
+
 //===----------------------------------------------------------------------===//
 // SME Outer Products
 //===----------------------------------------------------------------------===//
@@ -2130,9 +2134,12 @@ class sme2_cvt_vg4_single<bit sz, bits<3> op, RegisterOperand first_ty,
 }
 
 // SME2 multi-vec int down convert four registers
-multiclass sme2_int_cvt_vg4_single<string mnemonic, bits<3> op> {
-def _StoB : sme2_cvt_vg4_single<0, op, ZPR8, ZZZZ_s_mul_r, mnemonic>;
-def _DtoH : sme2_cvt_vg4_single<1, op, ZPR16, ZZZZ_d_mul_r, mnemonic>;
+multiclass sme2_int_cvt_vg4_single<string mnemonic, bits<3> op, SDPatternOperator intrinsic> {
+  def _StoB : sme2_cvt_vg4_single<0, op, ZPR8, ZZZZ_s_mul_r, mnemonic>;
+  def _DtoH : sme2_cvt_vg4_single<1, op, ZPR16, ZZZZ_d_mul_r, mnemonic>;
+
+  def : SME2_Cvt_VG4_Pat<NAME # _StoB, intrinsic, nxv16i8, nxv4i32>;
+  def : SME2_Cvt_VG4_Pat<NAME # _DtoH, intrinsic, nxv8i16, nxv2i64>;
 }
 
 class sme2_unpk_vector_vg4<bits<2>sz, bit u, RegisterOperand first_ty,
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll
new file mode 100644
index 0000000..9e4bc17d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py$
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+;
+; SQCVT
+;
+
+; x2
+define <vscale x 8 x i16 > @multi_vector_qcvt_x2_s16_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
+; CHECK-LABEL: multi_vector_qcvt_x2_s16_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sqcvt z0.h, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x2.nxv4i32(<vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
+  ret <vscale x 8 x i16> %res
+}
+
+; x4
+define <vscale x 16 x i8 > @multi_vector_qcvt_x4_s8_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_s8_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sqcvt z0.b, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.sqcvt.x4.nxv4i32(<vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @multi_vector_qcvt_x4_s16_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_s16_s64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sqcvt z0.h, { z4.d - z7.d }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
+  ret <vscale x 8 x i16> %res
+}
+
+;
+; UQCVT
+;
+
+; x2
+define <vscale x 8 x i16> @multi_vector_qcvt_x2_u16_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_qcvt_x2_u16_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    uqcvt z0.h, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.uqcvt.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret<vscale x 8 x i16> %res
+}
+
+; x4
+define <vscale x 16 x i8> @multi_vector_qcvt_x4_u8_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_u8_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    uqcvt z0.b, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.uqcvt.x4.nxv4i32(<vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @multi_vector_qcvt_x4_u16_u64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_u16_u64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    uqcvt z0.h, { z4.d - z7.d }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.uqcvt.x4.nxv2i64(<vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
+  ret <vscale x 8 x i16> %res
+}
+
+;
+; SQCVTU
+;
+
+; x2
+define <vscale x 8 x i16 > @multi_vector_qcvt_x2_s16_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
+; CHECK-LABEL: multi_vector_qcvt_x2_s16_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sqcvtu z0.h, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtu.x2.nxv4i32(<vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
+  ret <vscale x 8 x i16> %res
+}
+
+; x4
+define <vscale x 16 x i8> @multi_vector_qcvt_x4_u8_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_u8_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sqcvtu z0.b, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.sqcvtu.x4.nxv4i32(<vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @multi_vector_qcvt_x4_u16_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
+; CHECK-LABEL: multi_vector_qcvt_x4_u16_s64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sqcvtu z0.h, { z4.d - z7.d }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtu.x4.nxv2i64(<vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
+  ret <vscale x 8 x i16> %res
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqcvt.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtu.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqcvt.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uqcvt.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqcvt.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqcvtu.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtu.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
-- 
2.7.4