From d62500dca72c06b962a811016fe70d64dbb47879 Mon Sep 17 00:00:00 2001 From: Caroline Concatto Date: Mon, 23 Jan 2023 17:15:34 +0000 Subject: [PATCH] [AArch64][SME2] Add Multi-vector saturating extract narrow intrinsics Add the following intrinsics: SQCVT, SQCVTU, UQCVT NOTE: These intrinsics are still in development and are subject to future changes. Reviewed By: kmclaughlin Differential Revision: https://reviews.llvm.org/D142035 --- llvm/include/llvm/IR/IntrinsicsAArch64.td | 15 +++ llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 18 +-- llvm/lib/Target/AArch64/SMEInstrFormats.td | 13 +- llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll | 141 ++++++++++++++++++++++ 4 files changed, 175 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6e0459b..ca14f62 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2780,6 +2780,11 @@ let TargetPrefix = "aarch64" in { [llvm_nxv4f32_ty, llvm_nxv4f32_ty], [IntrNoMem]>; + class SME2_CVT_VG4_SINGLE_Intrinsic + : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>], + [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class SME2_CVT_FtoI_VG2_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], @@ -2874,4 +2879,14 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_VG4_Intrinsic; def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic; def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic; + + // + // Multi-vector saturating extract narrow + // + def int_aarch64_sve_sqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic; + def int_aarch64_sve_uqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic; + def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic; + def int_aarch64_sve_sqcvt_x4 : 
SME2_CVT_VG4_SINGLE_Intrinsic; + def int_aarch64_sve_uqcvt_x4 : SME2_CVT_VG4_SINGLE_Intrinsic; + def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 6572284..ba9b4b9 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -368,15 +368,15 @@ defm FCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"fcvtn", 0b0001, nxv8f16, nxv4f32, i defm BFCVT_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvt", 0b1000, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvt_x2>; defm BFCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvtn", 0b1001, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvtn_x2>; -defm SQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvt", 0b0110, nxv8i16, nxv4i32, null_frag>; -defm UQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"uqcvt", 0b0111, nxv8i16, nxv4i32, null_frag>; -defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b1110, nxv8i16, nxv4i32, null_frag>; -defm SQCVT_Z4Z : sme2_int_cvt_vg4_single<"sqcvt", 0b000>; -defm UQCVT_Z4Z : sme2_int_cvt_vg4_single<"uqcvt", 0b001>; -defm SQCVTU_Z4Z : sme2_int_cvt_vg4_single<"sqcvtu", 0b100>; -defm SQCVTN_Z4Z : sme2_int_cvt_vg4_single<"sqcvtn", 0b010>; -defm SQCVTUN_Z4Z : sme2_int_cvt_vg4_single<"sqcvtun", 0b110>; -defm UQCVTN_Z4Z : sme2_int_cvt_vg4_single<"uqcvtn", 0b011>; +defm SQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvt", 0b0110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvt_x2>; +defm UQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"uqcvt", 0b0111, nxv8i16, nxv4i32, int_aarch64_sve_uqcvt_x2>; +defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b1110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvtu_x2>; +defm SQCVT_Z4Z : sme2_int_cvt_vg4_single<"sqcvt", 0b000, int_aarch64_sve_sqcvt_x4>; +defm UQCVT_Z4Z : sme2_int_cvt_vg4_single<"uqcvt", 0b001, int_aarch64_sve_uqcvt_x4>; +defm SQCVTU_Z4Z : sme2_int_cvt_vg4_single<"sqcvtu", 0b100, int_aarch64_sve_sqcvtu_x4>; +defm SQCVTN_Z4Z : sme2_int_cvt_vg4_single<"sqcvtn", 0b010, 
null_frag>; +defm SQCVTUN_Z4Z : sme2_int_cvt_vg4_single<"sqcvtun", 0b110, null_frag>; +defm UQCVTN_Z4Z : sme2_int_cvt_vg4_single<"uqcvtn", 0b011, null_frag>; defm FCVTZS_2Z2Z_StoS : sme2_fp_cvt_vg2_multi<"fcvtzs", 0b00010>; defm FCVTZS_4Z4Z_StoS : sme2_fp_cvt_vg4_multi<"fcvtzs", 0b0001000>; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index c1decff..33ff5f0 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -137,6 +137,10 @@ class SME2_Cvt_VG2_Pat(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>; +class SME2_Cvt_VG4_Pat + : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4)), + (!cast(name) (REG_SEQUENCE ZPR4Mul4, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1, in_vt:$Zn3, zsub2, in_vt:$Zn4, zsub3))>; + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// @@ -2130,9 +2134,12 @@ class sme2_cvt_vg4_single op, RegisterOperand first_ty, } // SME2 multi-vec int down convert four registers -multiclass sme2_int_cvt_vg4_single op> { -def _StoB : sme2_cvt_vg4_single<0, op, ZPR8, ZZZZ_s_mul_r, mnemonic>; -def _DtoH : sme2_cvt_vg4_single<1, op, ZPR16, ZZZZ_d_mul_r, mnemonic>; +multiclass sme2_int_cvt_vg4_single op, SDPatternOperator intrinsic> { + def _StoB : sme2_cvt_vg4_single<0, op, ZPR8, ZZZZ_s_mul_r, mnemonic>; + def _DtoH : sme2_cvt_vg4_single<1, op, ZPR16, ZZZZ_d_mul_r, mnemonic>; + + def : SME2_Cvt_VG4_Pat; + def : SME2_Cvt_VG4_Pat; } class sme2_unpk_vector_vg4sz, bit u, RegisterOperand first_ty, diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll new file mode 100644 index 0000000..9e4bc17d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py$ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; +; SQCVT +; + +; x2 +define @multi_vector_qcvt_x2_s16_s32( %unused, %zn1, %zn2) { +; CHECK-LABEL: multi_vector_qcvt_x2_s16_s32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: sqcvt z0.h, { z2.s, z3.s } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvt.x2.nxv4i32( %zn1, %zn2) + ret %res +} + +; x4 +define @multi_vector_qcvt_x4_s8_s32( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_s8_s32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sqcvt z0.b, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvt.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +define @multi_vector_qcvt_x4_s16_s64( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_s16_s64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sqcvt z0.h, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvt.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +; +; UQCVT +; + +; x2 +define @multi_vector_qcvt_x2_u16_u32( %unused, %zn0, %zn1) { +; CHECK-LABEL: multi_vector_qcvt_x2_u16_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: uqcvt z0.h, { z2.s, z3.s } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uqcvt.x2.nxv4i32( %zn0, %zn1) + ret %res +} + +; x4 +define @multi_vector_qcvt_x4_u8_u32( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_u8_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: uqcvt z0.b, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call 
@llvm.aarch64.sve.uqcvt.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +define @multi_vector_qcvt_x4_u16_u64( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_u16_u64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: uqcvt z0.h, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uqcvt.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +; +; SQCVTU +; + +; x2 +define @multi_vector_qcvt_x2_s16_u32( %unused, %zn1, %zn2) { +; CHECK-LABEL: multi_vector_qcvt_x2_s16_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: sqcvtu z0.h, { z2.s, z3.s } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvtu.x2.nxv4i32( %zn1, %zn2) + ret %res +} + +; x4 +define @multi_vector_qcvt_x4_u8_s32( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_u8_s32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sqcvtu z0.b, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvtu.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +define @multi_vector_qcvt_x4_u16_s64( %unused, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: multi_vector_qcvt_x4_u16_s64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sqcvtu z0.h, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.sqcvtu.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret %res +} + +declare @llvm.aarch64.sve.sqcvt.x2.nxv4i32(, ) +declare @llvm.aarch64.sve.uqcvt.x2.nxv4i32(, ) +declare @llvm.aarch64.sve.sqcvtu.x2.nxv4i32(, ) +declare @llvm.aarch64.sve.sqcvt.x4.nxv4i32(, , , ) +declare @llvm.aarch64.sve.sqcvt.x4.nxv2i64(, , , ) +declare @llvm.aarch64.sve.uqcvt.x4.nxv4i32(, , , ) +declare 
@llvm.aarch64.sve.uqcvt.x4.nxv2i64(, , , ) +declare @llvm.aarch64.sve.sqcvtu.x4.nxv4i32(, , , ) +declare @llvm.aarch64.sve.sqcvtu.x4.nxv2i64(, , , ) -- 2.7.4