From 17f6e18acf5bedd505a7a2194b556fc6f559ffb4 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Wed, 22 Apr 2020 09:38:48 +0100
Subject: [PATCH] [AArch64][SVE] Add SVE intrinsic for LD1RQ

Summary:
Adds the following intrinsic for contiguous load & replicate:
  - @llvm.aarch64.sve.ld1rq

The LD1RQ intrinsic only needs the SImmS16XForm added by this patch.
The others (SImmS2XForm, SImmS3XForm & SImmS4XForm) were added for
consistency.

Reviewers: andwar, sdesmalen, efriedma, cameron.mcinally, dancgr, rengolin

Reviewed By: sdesmalen

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76929
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td         |   2 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp   |  21 +++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h     |   1 +
 llvm/lib/Target/AArch64/AArch64InstrFormats.td    |  21 ++-
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td    |  29 ++++
 llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll | 181 ++++++++++++++++++++++
 6 files changed, 251 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index c32fd48..5edbe70 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1307,6 +1307,8 @@ def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
+def int_aarch64_sve_ld1rq : AdvSIMD_1Vec_PredLoad_Intrinsic;
+
 //
 // Stores
 //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cd19696..ba31520 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1424,6 +1424,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::LDNF1S:          return "AArch64ISD::LDNF1S";
   case AArch64ISD::LDFF1:           return "AArch64ISD::LDFF1";
   case AArch64ISD::LDFF1S:          return "AArch64ISD::LDFF1S";
+  case AArch64ISD::LD1RQ:           return "AArch64ISD::LD1RQ";
   case AArch64ISD::GLD1:            return "AArch64ISD::GLD1";
   case AArch64ISD::GLD1_SCALED:     return "AArch64ISD::GLD1_SCALED";
   case AArch64ISD::GLD1_SXTW:       return "AArch64ISD::GLD1_SXTW";
@@ -11622,6 +11623,24 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
   return L;
 }
 
+static SDValue performLD1RQCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  EVT LoadVT = VT;
+  if (VT.isFloatingPoint())
+    LoadVT = VT.changeTypeToInteger();
+
+  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
+  SDValue Load = DAG.getNode(AArch64ISD::LD1RQ, DL, {LoadVT, MVT::Other}, Ops);
+  SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+  if (VT.isFloatingPoint())
+    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
+
+  return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
+
 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue Data = N->getOperand(2);
@@ -13211,6 +13230,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performNEONPostLDSTCombine(N, DCI, DAG);
   case Intrinsic::aarch64_sve_ldnt1:
     return performLDNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_ld1rq:
+    return performLD1RQCombine(N, DAG);
   case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
   case Intrinsic::aarch64_sve_ldnt1_gather:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 23ee452..8749d808 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -226,6 +226,7 @@ enum NodeType : unsigned {
   LDNF1S,
   LDFF1,
   LDFF1S,
+  LD1RQ,
 
   // Unsigned gather loads.
   GLD1,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 9d254bd..061e2a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -483,6 +483,19 @@ def uimm6s16 : Operand<i64>, ImmLeaf<i64,
+def SImmS2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def SImmS3XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
+}]>;
+def SImmS4XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def SImmS16XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+}]>;
+
 // simm6sN predicate - True if the immediate is a multiple of N in the range
 // [-32 * N, 31 * N].
 def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
@@ -506,27 +519,27 @@ def simm4s1 : Operand<i64>, ImmLeaf<i64,
 def simm4s2 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
   let PrintMethod = "printImmScale<2>";
   let ParserMatchClass = SImm4s2Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s3 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
   let PrintMethod = "printImmScale<3>";
   let ParserMatchClass = SImm4s3Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s4 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
   let PrintMethod = "printImmScale<4>";
   let ParserMatchClass = SImm4s4Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s16 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
   let PrintMethod = "printImmScale<16>";
   let ParserMatchClass = SImm4s16Operand;
   let DecoderMethod = "DecodeSImm<4>";
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1712cd2..52e0377 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -31,6 +31,16 @@ def AArch64ldff1 : SDNode<"AArch64ISD::LDFF1", SDT_AArch64_LD1, [SDNPHasChain, S
 def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
 def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
 
+// Contiguous load and replicate - node definitions
+//
+
+def SDT_AArch64_LD1RQ : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1rq : SDNode<"AArch64ISD::LD1RQ", SDT_AArch64_LD1RQ, [SDNPHasChain, SDNPMayLoad]>;
+
 // Gather loads - node definitions
 //
 def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
@@ -1317,6 +1327,25 @@ multiclass sve_prefetch
+  // LD1R of 128-bit masked data
+  def : Pat<(nxv16i8 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_B_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv8i16 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_H_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv4i32 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_W_IMM $gp, $base, (i64 0))>;
+  def : Pat<(nxv2i64 (AArch64ld1rq PPR:$gp, GPR64:$base)),
+            (LD1RQ_D_IMM $gp, $base, (i64 0))>;
+
+  def : Pat<(nxv16i8 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv8i16 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv4i32 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
+  def : Pat<(nxv2i64 (AArch64ld1rq PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+            (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
+
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8),  (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
index 8f180d0..74717d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,6 +1,179 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ;
+; LD1RQB
+;
+
+define <vscale x 16 x i8> @ld1rqb_i8(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0, #16]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i8 16
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0, #-128]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i8 -128
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_upper_bound:
+; CHECK: ld1rqb { z0.b }, p0/z, [x0, #112]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i8 112
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
+; CHECK: sub x8, x0, #129
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 -129
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
+; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
+; CHECK: add x8, x0, #113
+; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i8, i8* %addr, i64 113
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
+  ret <vscale x 16 x i8> %res
+}
+
+;
+; LD1RQH
+;
+
+define <vscale x 8 x i16> @ld1rqh_i16(<vscale x 8 x i1> %pred, i16* %addr) {
+; CHECK-LABEL: ld1rqh_i16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16(<vscale x 8 x i1> %pred, half* %addr) {
+; CHECK-LABEL: ld1rqh_f16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x i16> @ld1rqh_i16_imm(<vscale x 8 x i1> %pred, i16* %addr) {
+; CHECK-LABEL: ld1rqh_i16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-64]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i16, i16* %addr, i16 -32
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr) {
+; CHECK-LABEL: ld1rqh_f16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds half, half* %addr, i16 -8
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; LD1RQW
+;
+
+define <vscale x 4 x i32> @ld1rqw_i32(<vscale x 4 x i1> %pred, i32* %addr) {
+; CHECK-LABEL: ld1rqw_i32:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32(<vscale x 4 x i1> %pred, float* %addr) {
+; CHECK-LABEL: ld1rqw_f32:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 4 x i32> @ld1rqw_i32_imm(<vscale x 4 x i1> %pred, i32* %addr) {
+; CHECK-LABEL: ld1rqw_i32_imm:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0, #112]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i32, i32* %addr, i32 28
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, float* %addr) {
+; CHECK-LABEL: ld1rqw_f32_imm:
+; CHECK: ld1rqw { z0.s }, p0/z, [x0, #32]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds float, float* %addr, i32 8
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
+  ret <vscale x 4 x float> %res
+}
+
+;
+; LD1RQD
+;
+
+define <vscale x 2 x i64> @ld1rqd_i64(<vscale x 2 x i1> %pred, i64* %addr) {
+; CHECK-LABEL: ld1rqd_i64:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64(<vscale x 2 x i1> %pred, double* %addr) {
+; CHECK-LABEL: ld1rqd_f64:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
+  ret <vscale x 2 x double> %res
+}
+
+define <vscale x 2 x i64> @ld1rqd_i64_imm(<vscale x 2 x i1> %pred, i64* %addr) {
+; CHECK-LABEL: ld1rqd_i64_imm:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0, #64]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds i64, i64* %addr, i64 8
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, double* %addr) {
+; CHECK-LABEL: ld1rqd_f64_imm:
+; CHECK: ld1rqd { z0.d }, p0/z, [x0, #-128]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds double, double* %addr, i64 -16
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
+  ret <vscale x 2 x double> %res
+}
+
+;
 ; LDNT1B
 ;
@@ -79,6 +252,14 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
   ret <vscale x 2 x double> %res
 }
 
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
-- 
2.7.4
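
Editor's usage note (not part of the patch above): the tests exercise @llvm.aarch64.sve.ld1rq
directly from LLVM IR. For readers coming from C, the usual front-end route to a contiguous
load-and-replicate is the ACLE svld1rq builtins in arm_sve.h. The sketch below is an assumption
about that mapping; the Clang-side lowering of svld1rq to this intrinsic is outside this patch,
and the function names here are illustrative only.

// Hypothetical C-level usage (assumption: clang lowers svld1rq_* to
// @llvm.aarch64.sve.ld1rq.*). Build with something like:
//   clang -O2 -march=armv8-a+sve -c ld1rq_example.c
#include <arm_sve.h>

// Load 16 bytes from *src under predicate pg and replicate them across the
// scalable vector; expected to select to "ld1rqb { z0.b }, p0/z, [x0]".
svint8_t broadcast_quadword(svbool_t pg, const int8_t *src) {
  return svld1rq_s8(pg, src);
}

// Same idea for 32-bit elements; a constant byte offset that is a multiple
// of 16 in [-128, 112] may be folded into the immediate form (here +16 bytes).
svint32_t broadcast_quadword_offset(svbool_t pg, const int32_t *src) {
  return svld1rq_s32(pg, src + 4);
}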