From 5c2c94648e424d1c2ccfd7bd3c7147bb6389198d Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Wed, 30 Oct 2019 11:44:29 +0000
Subject: [PATCH] [AArch64][SVE] Implement masked store intrinsics

Summary:
Adds support for codegen of masked stores, with non-truncating
and truncating variants.

Reviewers: huntergr, greened, dmgreen, rovka, sdesmalen

Reviewed By: dmgreen, sdesmalen

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69378
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td        |  28 ++++
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td     |  30 +++++
 .../Target/AArch64/AArch64TargetTransformInfo.h    |  10 +-
 .../test/CodeGen/AArch64/sve-masked-ldst-nonext.ll | 148 ++++++++++++++++++---
 llvm/test/CodeGen/AArch64/sve-masked-ldst-trunc.ll |  66 +++++++++
 5 files changed, 260 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-masked-ldst-trunc.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4e1be26..c0cb471 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -309,6 +309,34 @@ def zext_masked_load_i32 :
   return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
 }]>;
 
+// non-truncating masked store fragment.
+def nontrunc_masked_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, node:$pred), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+// truncating masked store fragments.
+def trunc_masked_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def trunc_masked_store_i8 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def trunc_masked_store_i16 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def trunc_masked_store_i32 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
 // Node definitions.
 def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
 def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 290bb32..502de47 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1108,6 +1108,36 @@ let Predicates = [HasSVE] in {
 
   // 16-element contiguous loads
   defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
+  multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
+                        Instruction RegImmInst> {
+    def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
+                       (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
+  }
+
+  // 2-element contiguous stores
+  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8,  ST1B_D_IMM>;
+  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
+  defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
+  defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store,  ST1D_IMM>;
+  defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store,  ST1H_D_IMM>;
+  defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store,  ST1W_D_IMM>;
+  defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store,  ST1D_IMM>;
+
+  // 4-element contiguous stores
+  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8,  ST1B_S_IMM>;
+  defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
+  defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store,  ST1W_IMM>;
+  defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store,  ST1H_S_IMM>;
+  defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store,  ST1W_IMM>;
+
+  // 8-element contiguous stores
+  defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8,  ST1B_H_IMM>;
+  defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store,  ST1H_IMM>;
+  defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store,  ST1H_IMM>;
+
+  // 16-element contiguous stores
+  defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+
 }
 
 let Predicates = [HasSVE2] in {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9956c9c..8f313e3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -147,7 +147,7 @@ public:
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
 
-  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) {
+  bool isLegalMaskedLoadStore(Type *DataType, MaybeAlign Alignment) {
     if (!isa<VectorType>(DataType) || !ST->hasSVE())
       return false;
 
@@ -162,6 +162,14 @@ public:
     return false;
   }
 
+  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) {
+    return isLegalMaskedLoadStore(DataType, Alignment);
+  }
+
+  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+    return isLegalMaskedLoadStore(DataType, Alignment);
+  }
+
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
index 461d290..b754266 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
@@ -1,79 +1,173 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
 
 ;
 ; Masked Loads
 ;
 
-define <vscale x 2 x i64> @masked_load_nxv2i64(<vscale x 2 x i64> *%a, <vscale x 2 x i1> %mask) {
+define <vscale x 2 x i64> @masked_load_nxv2i64(<vscale x 2 x i64> *%a, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv2i64:
-; CHECK: ld1d { [[IN:z[0-9]+]].d }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64> *%a, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
   ret <vscale x 2 x i64> %load
 }
 
-define <vscale x 4 x i32> @masked_load_nxv4i32(<vscale x 4 x i32> *%a, <vscale x 4 x i1> %mask) {
+define <vscale x 4 x i32> @masked_load_nxv4i32(<vscale x 4 x i32> *%a, <vscale x 4 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv4i32:
-; CHECK: ld1w { [[IN:z[0-9]+]].s }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32> *%a, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
   ret <vscale x 4 x i32> %load
 }
 
-define <vscale x 8 x i16> @masked_load_nxv8i16(<vscale x 8 x i16> *%a, <vscale x 8 x i1> %mask) {
+define <vscale x 8 x i16> @masked_load_nxv8i16(<vscale x 8 x i16> *%a, <vscale x 8 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv8i16:
-; CHECK: ld1h { [[IN:z[0-9]+]].h }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16> *%a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
   ret <vscale x 8 x i16> %load
 }
 
-define <vscale x 16 x i8> @masked_load_nxv16i8(<vscale x 16 x i8> *%a, <vscale x 16 x i1> %mask) {
+define <vscale x 16 x i8> @masked_load_nxv16i8(<vscale x 16 x i8> *%a, <vscale x 16 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv16i8:
-; CHECK: ld1b { [[IN:z[0-9]+]].b }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8> *%a, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
   ret <vscale x 16 x i8> %load
 }
 
-define <vscale x 2 x double> @masked_load_nxv2f64(<vscale x 2 x double> *%a, <vscale x 2 x i1> %mask) {
+define <vscale x 2 x double> @masked_load_nxv2f64(<vscale x 2 x double> *%a, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv2f64:
-; CHECK: ld1d { [[IN:z[0-9]+]].d }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double> *%a, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
   ret <vscale x 2 x double> %load
 }
 
-define <vscale x 2 x float> @masked_load_nxv2f32(<vscale x 2 x float> *%a, <vscale x 2 x i1> %mask) {
+define <vscale x 2 x float> @masked_load_nxv2f32(<vscale x 2 x float> *%a, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv2f32:
-; CHECK: ld1w { [[IN:z[0-9]+]].d }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float> *%a, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
   ret <vscale x 2 x float> %load
 }
 
-define <vscale x 2 x half> @masked_load_nxv2f16(<vscale x 2 x half> *%a, <vscale x 2 x i1> %mask) {
+define <vscale x 2 x half> @masked_load_nxv2f16(<vscale x 2 x half> *%a, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv2f16:
-; CHECK: ld1h { [[IN:z[0-9]+]].d }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half> *%a, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
   ret <vscale x 2 x half> %load
 }
 
-define <vscale x 4 x float> @masked_load_nxv4f32(<vscale x 4 x float> *%a, <vscale x 4 x i1> %mask) {
+define <vscale x 4 x float> @masked_load_nxv4f32(<vscale x 4 x float> *%a, <vscale x 4 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv4f32:
-; CHECK: ld1w { [[IN:z[0-9]+]].s }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float> *%a, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
   ret <vscale x 4 x float> %load
 }
 
-define <vscale x 4 x half> @masked_load_nxv4f16(<vscale x 4 x half> *%a, <vscale x 4 x i1> %mask) {
+define <vscale x 4 x half> @masked_load_nxv4f16(<vscale x 4 x half> *%a, <vscale x 4 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv4f16:
-; CHECK: ld1h { [[IN:z[0-9]+]].s }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half> *%a, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
   ret <vscale x 4 x half> %load
 }
 
-define <vscale x 8 x half> @masked_load_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x i1> %mask) {
+define <vscale x 8 x half> @masked_load_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x i1> %mask) nounwind {
 ; CHECK-LABEL: masked_load_nxv8f16:
-; CHECK: ld1h { [[IN:z[0-9]+]].h }, [[PG:p[0-9]+]]/z, [x0]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
   %load = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half> *%a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
   ret <vscale x 8 x half> %load
 }
 
+;
+; Masked Stores
+;
+
+define void @masked_store_nxv2i64(<vscale x 2 x i64> *%a, <vscale x 2 x i64> %val, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv2i64:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %val, <vscale x 2 x i64> *%a, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv4i32(<vscale x 4 x i32> *%a, <vscale x 4 x i32> %val, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv4i32:
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %val, <vscale x 4 x i32> *%a, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv8i16(<vscale x 8 x i16> *%a, <vscale x 8 x i16> %val, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv8i16:
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %val, <vscale x 8 x i16> *%a, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv16i8(<vscale x 16 x i8> *%a, <vscale x 16 x i8> %val, <vscale x 16 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv16i8:
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %val, <vscale x 16 x i8> *%a, i32 1, <vscale x 16 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv2f64(<vscale x 2 x double> *%a, <vscale x 2 x double> %val, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv2f64:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %val, <vscale x 2 x double> *%a, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv2f32(<vscale x 2 x float> *%a, <vscale x 2 x float> %val, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv2f32:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x float> *%a, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv2f16(<vscale x 2 x half> *%a, <vscale x 2 x half> %val, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv2f16:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %val, <vscale x 2 x half> *%a, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv4f32(<vscale x 4 x float> *%a, <vscale x 4 x float> %val, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv4f32:
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %val, <vscale x 4 x float> *%a, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv4f16(<vscale x 4 x half> *%a, <vscale x 4 x half> %val, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv4f16:
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %val, <vscale x 4 x half> *%a, i32 2, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x half> %val, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv8f16:
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %val, <vscale x 8 x half> *%a, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
 declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
@@ -85,3 +179,15 @@ declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32
 declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
 declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+
+declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
+
+declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-trunc.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-trunc.ll
new file mode 100644
index 0000000..d2069da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-trunc.ll
@@ -0,0 +1,66 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+
+;
+; Masked Stores
+;
+
+define void @masked_trunc_store_nxv2i8(<vscale x 2 x i64> *%a, <vscale x 2 x i64> %val, <vscale x 2 x i8> *%b, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv2i8:
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
+  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, <vscale x 2 x i8> *%b, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_trunc_store_nxv2i16(<vscale x 2 x i64> *%a, <vscale x 2 x i64> %val, <vscale x 2 x i16> *%b, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv2i16:
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
+  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, <vscale x 2 x i16> *%b, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_trunc_store_nxv2i32(<vscale x 2 x i64> *%a, <vscale x 2 x i64> %val, <vscale x 2 x i32> *%b, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv2i32:
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
+  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, <vscale x 2 x i32> *%b, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_trunc_store_nxv4i8(<vscale x 4 x i32> *%a, <vscale x 4 x i32> %val, <vscale x 4 x i8> *%b, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv4i8:
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
+  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, <vscale x 4 x i8> *%b, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @masked_trunc_store_nxv4i16(<vscale x 4 x i32> *%a, <vscale x 4 x i32> %val, <vscale x 4 x i16> *%b, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv4i16:
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
+  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, <vscale x 4 x i16> *%b, i32 4, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @masked_trunc_store_nxv8i8(<vscale x 8 x i16> *%a, <vscale x 8 x i16> %val, <vscale x 8 x i8> *%b, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_nxv8i8:
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
+  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, <vscale x 8 x i8> *%b, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>*, i32, <vscale x 8 x i1>)
-- 
2.7.4
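
Example usage (an illustrative sketch, not part of the patch): with this change, a
truncating masked store written in LLVM IR as below is expected to lower to a single
predicated SVE store. The function and value names here are hypothetical; the IR
pattern, alignment, and expected codegen mirror the sve-masked-ldst-trunc.ll tests above.

  ; Truncate a <vscale x 4 x i32> vector to i16 elements and store only the
  ; active lanes. Expected codegen with -mattr=+sve (per the tests above):
  ;   st1h { z0.s }, p0, [x0]
  define void @example_trunc_store(<vscale x 4 x i32> %val, <vscale x 4 x i16> *%dst, <vscale x 4 x i1> %mask) {
    %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
    call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, <vscale x 4 x i16> *%dst, i32 4, <vscale x 4 x i1> %mask)
    ret void
  }

  declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)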