From 338fd211e7c36d7a2e0eee4c3530ed656c893a9d Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 11 Jun 2022 14:16:37 +0100 Subject: [PATCH] [AArch64] Generate FADDP from shuffled fadd As a follow-up to D126686, this does the same fold for floating point add and shuffle. In this case it is limited to fadd nodes with the reassoc flag, as the fold may compute either x[0]+x[1] or x[1]+x[0] for both result[0] and result[1]. Differential Revision: https://reviews.llvm.org/D127087 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 ++++++++++--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 ++++- llvm/test/CodeGen/AArch64/faddp-half.ll | 7 +++---- llvm/test/CodeGen/AArch64/faddp.ll | 28 +++++++++++-------------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f75c5e8..c855d12 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1071,6 +1071,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ADD, VT, Custom); + // FADDP custom lowering + for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) + setOperationAction(ISD::FADD, VT, Custom); } if (Subtarget->hasSVE()) { @@ -19317,9 +19320,13 @@ void AArch64TargetLowering::ReplaceBITCASTResults( } static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.is256BitVector()) + if (!VT.is256BitVector() || + (VT.getScalarType().isFloatingPoint() && + !N->getFlags().hasAllowReassociation()) || + (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16())) return; SDValue X = N->getOperand(0); @@ -19537,7 +19544,8 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); 
return; case ISD::ADD: - ReplaceAddWithADDP(N, Results, DAG); + case ISD::FADD: + ReplaceAddWithADDP(N, Results, DAG, Subtarget); return; case ISD::CTPOP: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 00c0d74..65ca797 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -706,6 +706,9 @@ def AArch64uaddlp : PatFrags<(ops node:$src), def AArch64saddlp : PatFrags<(ops node:$src), [(AArch64saddlp_n node:$src), (int_aarch64_neon_saddlp node:$src)]>; +def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>; def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -4535,7 +4538,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast("FABD"#VT) VT:$Rn, V } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp", AArch64faddp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll index 8413ab0..4cb9832 100644 --- a/llvm/test/CodeGen/AArch64/faddp-half.ll +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -214,10 +214,9 @@ entry: define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECK-LABEL: addp_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32 v2.8h, v0.8h -; CHECK-NEXT: rev32 v3.8h, v1.8h -; CHECK-NEXT: fadd v0.8h, v2.8h, v0.8h -; CHECK-NEXT: fadd 
v1.8h, v3.8h, v1.8h +; CHECK-NEXT: faddp v1.8h, v0.8h, v1.8h +; CHECK-NEXT: zip1 v0.8h, v1.8h, v1.8h +; CHECK-NEXT: zip2 v1.8h, v1.8h, v1.8h ; CHECK-NEXT: ret ; ; CHECKNOFP16-LABEL: addp_v16f16: diff --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll index 3802b2f..2da2e8f 100644 --- a/llvm/test/CodeGen/AArch64/faddp.ll +++ b/llvm/test/CodeGen/AArch64/faddp.ll @@ -191,10 +191,9 @@ entry: define <4 x double> @addp_v4f64(<4 x double> %a) { ; CHECK-LABEL: addp_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: fadd v1.2d, v3.2d, v1.2d +; CHECK-NEXT: faddp v1.2d, v0.2d, v1.2d +; CHECK-NEXT: dup v0.2d, v1.d[0] +; CHECK-NEXT: dup v1.2d, v1.d[1] ; CHECK-NEXT: ret entry: %s = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> @@ -231,10 +230,9 @@ entry: define <8 x float> @addp_v8f32_slow(<8 x float> %a) { ; CHECK-LABEL: addp_v8f32_slow: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v2.4s, v0.4s -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s +; CHECK-NEXT: faddp v1.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v1.4s ; CHECK-NEXT: ret entry: %s = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> @@ -245,14 +243,12 @@ entry: define <16 x float> @addp_v16f32(<16 x float> %a) { ; CHECK-LABEL: addp_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: fadd v0.4s, v4.4s, v0.4s -; CHECK-NEXT: fadd v1.4s, v5.4s, v1.4s -; CHECK-NEXT: fadd v2.4s, v6.4s, v2.4s -; CHECK-NEXT: fadd v3.4s, v7.4s, v3.4s +; CHECK-NEXT: faddp v3.4s, v2.4s, v3.4s +; CHECK-NEXT: faddp v1.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v2.4s, v3.4s, v3.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v1.4s 
+; CHECK-NEXT: zip2 v1.4s, v1.4s, v1.4s +; CHECK-NEXT: zip2 v3.4s, v3.4s, v3.4s ; CHECK-NEXT: ret entry: %s = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> -- 2.7.4