From 9ec57cce6240bcdef599cc8a98590193f16b2d3e Mon Sep 17 00:00:00 2001
From: Shawn Landden
Date: Sun, 7 Jun 2020 18:56:17 +0400
Subject: [PATCH] [AArch64] custom lowering for i128 popcount halves the number of CNT instructions generated

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 13 +++++
 llvm/test/CodeGen/AArch64/popcount.ll           | 78 +++++++++++++------------
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8671209..acfd77d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -370,6 +370,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i128, Custom);
 
   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
@@ -5423,6 +5424,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
     if (VT == MVT::i64)
       UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
     return UaddLV;
+  } else if (VT == MVT::i128) {
+    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
+
+    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
+    SDValue UaddLV = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
   }
 
   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
@@ -14105,6 +14115,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
 
+  case ISD::CTPOP:
+    Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
+    return;
   case AArch64ISD::SADDV:
     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
     return;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index fc71ed0..1059697 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -1,18 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown | FileCheck %s
 
 ; Function Attrs: nobuiltin nounwind readonly
 define i8 @popcount128(i128* nocapture nonnull readonly %0) {
 ; CHECK-LABEL: popcount128:
 ; CHECK:       // %bb.0: // %Entry
-; CHECK-NEXT:    ldp d1, d0, [x0]
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    cnt v0.16b, v1.16b
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
 Entry:
   %1 = load i128, i128* %0, align 16
@@ -28,23 +30,27 @@ declare i128 @llvm.ctpop.i128(i128)
 define i16 @popcount256(i256* nocapture nonnull readonly %0) {
 ; CHECK-LABEL: popcount256:
 ; CHECK:       // %bb.0: // %Entry
-; CHECK-NEXT:    ldp d1, d0, [x0, #16]
-; CHECK-NEXT:    ldp d3, d2, [x0]
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    cnt v2.8b, v2.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    cnt v0.8b, v3.8b
-; CHECK-NEXT:    uaddlv h2, v2.8b
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    ldr x9, [x0, #24]
+; CHECK-NEXT:    ldr d0, [x0, #16]
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov v1.d[1], x9
+; CHECK-NEXT:    cnt v0.16b, v1.16b
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    add w9, w9, w10
-; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    cnt v0.16b, v1.16b
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
 Entry:
   %1 = load i256, i256* %0, align 16
@@ -59,17 +65,17 @@ declare i256 @llvm.ctpop.i256(i256)
 define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-LABEL: popcount1x128:
 ; CHECK:       // %bb.0: // %Entry
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    cnt v1.8b, v1.8b
-; CHECK-NEXT:    uaddlv h0, v0.8b
-; CHECK-NEXT:    uaddlv h1, v1.8b
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    add x0, x9, x8
-; CHECK-NEXT:    mov x1, v2.d[1]
+; CHECK-NEXT:    // implicit-def: $q0
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov v0.d[1], x1
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    // kill: def $x0 killed $w0
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    ret
 Entry:
   %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
-- 
2.7.4