[AArch64] Use CNT for ISD::CTPOP and ISD::PARITY if available

author Ties Stuij <ties.stuij@arm.com>

Thu, 1 Dec 2022 16:37:50 +0000 (16:37 +0000)

committer Ties Stuij <ties.stuij@arm.com>

Fri, 2 Dec 2022 11:27:14 +0000 (11:27 +0000)
author Ties Stuij <ties.stuij@arm.com>
Thu, 1 Dec 2022 16:37:50 +0000 (16:37 +0000)
committer Ties Stuij <ties.stuij@arm.com>
Fri, 2 Dec 2022 11:27:14 +0000 (11:27 +0000)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

index 5162f5e..184ea8e 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,12 +541,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  
-  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+  if (Subtarget->hasCSSC()) {
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i128, Expand);
+    setOperationAction(ISD::PARITY, MVT::i128, Expand);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i128, Custom);
  
-  setOperationAction(ISD::PARITY, MVT::i64, Custom);
-  setOperationAction(ISD::PARITY, MVT::i128, Custom);
+    setOperationAction(ISD::PARITY, MVT::i64, Custom);
+    setOperationAction(ISD::PARITY, MVT::i128, Custom);
+  }
  
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -8413,8 +8420,16 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
      return SDValue();
  
    bool IsParity = Op.getOpcode() == ISD::PARITY;
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
  
-  // While there is no integer popcount instruction, it can
+  // For i32, the general parity function using EORs is more efficient than
+  // using floating point.
+  if (VT == MVT::i32 && IsParity)
+    return SDValue();
+
+  // If there is no scalar CNT instruction (CSSC) available, GPR popcount can
    // be more efficiently lowered to the following sequence that uses
    // AdvSIMD registers/instructions as long as the copies to/from
    // the AdvSIMD registers are cheap.
@@ -8422,10 +8437,6 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
    //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
    //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
    //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
-  SDValue Val = Op.getOperand(0);
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
    if (VT == MVT::i32 || VT == MVT::i64) {
      if (VT == MVT::i32)
        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td

index 427f5dc..9c0c374 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8529,7 +8529,7 @@ defm RCWSWP  : ReadCheckWriteOperation<0b010, "swp">;
  // General Data-Processing Instructions (FEAT_V94_DP)
  //===----------------------------------------------------------------------===//
  defm ABS : OneOperandData<0b001000, "abs">, Requires<[HasCSSC]>;
-defm CNT : OneOperandData<0b000111, "cnt">, Requires<[HasCSSC]>;
+defm CNT : OneOperandData<0b000111, "cnt", ctpop>, Requires<[HasCSSC]>;
  defm CTZ : OneOperandData<0b000110, "ctz">, Requires<[HasCSSC]>;
  
  defm SMAX : ComparisonOp<0, 0, "smax">, Requires<[HasCSSC]>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll

index 872829e..2a4b30a 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,6 +1,7 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
  ; RUN: llc < %s -mtriple=aarch64-eabi -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
+; RUN: llc < %s -mtriple=aarch64-eabi -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
  
  define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
  ; CHECK-LABEL: cnt32_advsimd:
@@ -27,6 +28,11 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
  ; CHECK-NONEON-NEXT:    mul w8, w9, w8
  ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
    %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
    ret i32 %cnt
  }
@@ -57,6 +63,13 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
  ; CHECK-NONEON-NEXT:    mul w8, w9, w8
  ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd_2:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-CSSC-NEXT:    fmov w8, s0
+; CHECK-CSSC-NEXT:    cnt w0, w8
+; CHECK-CSSC-NEXT:    ret
    %1 = extractelement <2 x i32> %x, i64 0
    %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
    ret i32 %2
@@ -86,6 +99,11 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
  ; CHECK-NONEON-NEXT:    mul x8, x9, x8
  ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
    %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
    ret i64 %cnt
  }
@@ -125,6 +143,11 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
  ; CHECK-NONEON-NEXT:    mul w8, w9, w8
  ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
    %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
    ret i32 %cnt
  }
@@ -161,6 +184,11 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
  ; CHECK-NONEON-NEXT:    mul x8, x9, x8
  ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
    %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
    ret i64 %cnt
  }
@@ -181,6 +209,13 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
  ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
  ; CHECK-NONEON-NEXT:    cset w0, ne
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_eq_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, eq
+; CHECK-CSSC-NEXT:    ret
    %count = tail call i64 @llvm.ctpop.i64(i64 %x)
    %cmp = icmp eq i64 %count, 1
    %conv = zext i1 %cmp to i32
@@ -203,6 +238,13 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
  ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
  ; CHECK-NONEON-NEXT:    cset w0, eq
  ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_ne_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, ne
+; CHECK-CSSC-NEXT:    ret
    %count = tail call i64 @llvm.ctpop.i64(i64 %x)
    %cmp = icmp ne i64 %count, 1
    %conv = zext i1 %cmp to i32
diff --git a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll

index 5de6a30..af5652a 100644 (file)
--- a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon -mattr=+cssc < %s | FileCheck %s -check-prefix=CHECK-CSSC
  
  declare i128 @llvm.ctpop.i128(i128)
  
@@ -31,6 +32,14 @@ define i128 @ctpop_i128(i128 %i) {
  ; CHECK-NEXT:    lsr x9, x9, #56
  ; CHECK-NEXT:    add x0, x9, x8, lsr #56
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_i128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x1
+; CHECK-CSSC-NEXT:    cnt x9, x0
+; CHECK-CSSC-NEXT:    add x0, x9, x8
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    ret
    %c = call i128 @llvm.ctpop.i128(i128 %i)
    ret i128 %c
  }
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll

index 534892d..19dd185 100644 (file)
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mattr=+cssc | FileCheck %s -check-prefix=CHECK-CSSC
  
  define i4 @parity_4(i4 %x) {
  ; CHECK-LABEL: parity_4:
@@ -9,6 +10,13 @@ define i4 @parity_4(i4 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_4:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xf
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
    %2 = and i4 %1, 1
    ret i4 %2
@@ -23,6 +31,13 @@ define i8 @parity_8(i8 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
    %2 = and i8 %1, 1
    ret i8 %2
@@ -38,6 +53,13 @@ define i16 @parity_16(i16 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_16:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
    %2 = and i16 %1, 1
    ret i16 %2
@@ -54,6 +76,13 @@ define i17 @parity_17(i17 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_17:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0x1ffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
    %2 = and i17 %1, 1
    ret i17 %2
@@ -69,6 +98,12 @@ define i32 @parity_32(i32 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
    %2 = and i32 %1, 1
    ret i32 %2
@@ -83,6 +118,12 @@ define i64 @parity_64(i64 %x) {
  ; CHECK-NEXT:    fmov w8, s0
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
    %2 = and i64 %1, 1
    ret i64 %2
@@ -99,6 +140,14 @@ define i128 @parity_128(i128 %x) {
  ; CHECK-NEXT:    fmov w8, s0
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    eor x8, x0, x1
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    cnt x8, x8
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i128 @llvm.ctpop.i128(i128 %x)
    %2 = and i128 %1, 1
    ret i128 %2
@@ -113,6 +162,12 @@ define i32 @parity_64_trunc(i64 %x) {
  ; CHECK-NEXT:    fmov w8, s0
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
    %2 = trunc i64 %1 to i32
    %3 = and i32 %2, 1
@@ -129,6 +184,12 @@ define i8 @parity_32_trunc(i32 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
    %2 = trunc i32 %1 to i8
    %3 = and i8 %2, 1
@@ -144,6 +205,13 @@ define i32 @parity_8_zext(i8 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_zext:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %a = zext i8 %x to i32
    %b = tail call i32 @llvm.ctpop.i32(i32 %a)
    %c = and i32 %b, 1
@@ -159,6 +227,13 @@ define i32 @parity_8_mask(i32 %x) {
  ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
  ; CHECK-NEXT:    and w0, w8, #0x1
  ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_mask:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
    %a = and i32 %x, 255
    %b = tail call i32 @llvm.ctpop.i32(i32 %a)
    %c = and i32 %b, 1
author	Ties Stuij <ties.stuij@arm.com>
	Thu, 1 Dec 2022 16:37:50 +0000 (16:37 +0000)
committer	Ties Stuij <ties.stuij@arm.com>
	Fri, 2 Dec 2022 11:27:14 +0000 (11:27 +0000)
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64InstrInfo.td		patch \| blob \| history
llvm/test/CodeGen/AArch64/arm64-popcnt.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/ctpop-nonean.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/parity.ll		patch \| blob \| history