From 661403b85c219a83baa37335a870d4d93dc4b1c3 Mon Sep 17 00:00:00 2001 From: Zain Jaffal Date: Fri, 30 Sep 2022 11:03:48 +0100 Subject: [PATCH] [AArch64] Add support for 128-bit non temporal loads. Building on the work done in `D131773`, this patch adds support for 128-bit non-temporal loads. Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D132559 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 31 +++++++++++++++++++++++-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 ++++ llvm/test/CodeGen/AArch64/nontemporal-load.ll | 15 ++++++++---- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 692a254..5d0866e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -799,6 +799,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v8f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f64, Custom); setOperationAction(ISD::LOAD, MVT::v4i64, Custom); + // 128-bit non-temporal loads can be lowered to LDNP using custom lowering. + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Custom); + setOperationAction(ISD::LOAD, MVT::v8i16, Custom); + setOperationAction(ISD::LOAD, MVT::v16i8, Custom); // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. 
@@ -2330,6 +2335,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) MAKE_CASE(AArch64ISD::LDP) MAKE_CASE(AArch64ISD::LDNP) + MAKE_CASE(AArch64ISD::LDNP128) MAKE_CASE(AArch64ISD::STP) MAKE_CASE(AArch64ISD::STNP) MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) @@ -5414,6 +5420,27 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SDLoc DL(Op); LoadSDNode *LoadNode = cast(Op); assert(LoadNode && "Expected custom lowering of a load node"); + // Handle lowering 128-bit non temporal loads for little-endian targets. + EVT MemVT = LoadNode->getMemoryVT(); + if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() && + MemVT.getSizeInBits() == 128 && + (MemVT.getScalarSizeInBits() == 8u || + MemVT.getScalarSizeInBits() == 16u || + MemVT.getScalarSizeInBits() == 32u || + MemVT.getScalarSizeInBits() == 64u)) { + + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::LDNP128, DL, + DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + MVT::Other}), + {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), + LoadNode->getMemOperand()); + + SDValue P = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MemVT, + Result.getValue(0), Result.getValue(1)); + return DAG.getMergeValues({P, Result.getValue(2) /* Chain */}, DL); + } if (LoadNode->getMemoryVT() == MVT::i64x8) { SmallVector Ops; @@ -5435,9 +5462,9 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, // Custom lowering for extending v4i8 vector loads. 
EVT VT = Op->getValueType(0); - assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); - if (LoadNode->getMemoryVT() != MVT::v4i8) + if ((VT != MVT::v4i16 && VT != MVT::v4i32) || + LoadNode->getMemoryVT() != MVT::v4i8) return SDValue(); unsigned ExtType; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 0a98d0b..bf5bce7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -457,6 +457,7 @@ enum NodeType : unsigned { LDP, LDNP, + LDNP128, STP, STNP, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 7695080..2b2700d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -319,6 +319,7 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64ldnp128 : SDTypeProfile<2, 1, [SDTCisVT<0, v2i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; @@ -732,6 +733,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def AArch64ldnp128 : SDNode<"AArch64ISD::LDNP128", SDT_AArch64ldnp128, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stnp : SDNode<"AArch64ISD::STNP", 
SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -2592,6 +2594,9 @@ def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), def : Pat<(AArch64ldnp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)), (LDNPQi GPR64sp:$Rn, simm7s16:$offset)>; + +def : Pat<(AArch64ldnp128 (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (LDNPDi GPR64sp:$Rn, simm7s8:$offset)>; //--- // (register offset) //--- diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll index 288ba22..12f1de0 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -103,7 +103,8 @@ define <32 x i8> @test_ldnp_v32i8(<32 x i8>* %A) { define <4 x i32> @test_ldnp_v4i32(<4 x i32>* %A) { ; CHECK-LABEL: test_ldnp_v4i32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldnp d0, d1, [x0] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v4i32: @@ -117,7 +118,8 @@ define <4 x i32> @test_ldnp_v4i32(<4 x i32>* %A) { define <4 x float> @test_ldnp_v4f32(<4 x float>* %A) { ; CHECK-LABEL: test_ldnp_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldnp d0, d1, [x0] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v4f32: @@ -131,7 +133,8 @@ define <4 x float> @test_ldnp_v4f32(<4 x float>* %A) { define <8 x i16> @test_ldnp_v8i16(<8 x i16>* %A) { ; CHECK-LABEL: test_ldnp_v8i16: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldnp d0, d1, [x0] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v8i16: @@ -145,7 +148,8 @@ define <8 x i16> @test_ldnp_v8i16(<8 x i16>* %A) { define <16 x i8> @test_ldnp_v16i8(<16 x i8>* %A) { ; CHECK-LABEL: test_ldnp_v16i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldnp d0, d1, [x0] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v16i8: @@ -158,7 +162,8 @@ define <16 x i8> 
@test_ldnp_v16i8(<16 x i8>* %A) { define <2 x double> @test_ldnp_v2f64(<2 x double>* %A) { ; CHECK-LABEL: test_ldnp_v2f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldnp d0, d1, [x0] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v2f64: -- 2.7.4