From 18149218939774b5eac228adc5c3958fbaac9028 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Wed, 16 Nov 2022 10:45:48 +0000
Subject: [PATCH] [AArch64][CodeGen] Remove redundant vector negations before
 concat

This adds a new canonicalization rule that replaces a concat of truncated
bitwise negations (nots) with a single negation of the concatenated
truncates, e.g.

    (concat_vectors (v4i16 (truncate (not (v4i32)))),
                    (v4i16 (truncate (not (v4i32)))))
   ->
    (not (concat_vectors (v4i16 (truncate (v4i32))),
                         (v4i16 (truncate (v4i32)))))

This avoids emitting redundant negations in certain cases.

Reviewed By: peterwaller-arm

Differential Revision: https://reviews.llvm.org/D137433
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    | 31 +++++++++-
 .../illegal-floating-point-vector-compares.ll      | 35 ++++-------
 .../pull-negations-after-concat-of-truncates.ll    | 68 ++++++++++++++++++++++
 3 files changed, 110 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index da1d214..b06f630 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16229,8 +16229,37 @@ static SDValue performConcatVectorsCombine(SDNode *N,
     }
   }
 
+  // Canonicalise concat_vectors to replace concatenations of truncated nots
+  // with nots of concatenated truncates. In some cases this allows multiple
+  // redundant negations to be eliminated.
+  //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
+  //                  (v4i16 (truncate (not (v4i32)))))
+  // ->
+  //  (not (concat_vectors (v4i16 (truncate (v4i32))),
+  //                       (v4i16 (truncate (v4i32)))))
+  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+      N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
+      N->isOnlyUserOf(N1.getNode())) {
+    auto isBitwiseVectorNegate = [](SDValue V) {
+      return V->getOpcode() == ISD::XOR &&
+             ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
+    };
+    SDValue N00 = N0->getOperand(0);
+    SDValue N10 = N1->getOperand(0);
+    if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
+        isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
+      return DAG.getNOT(
+          dl,
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                      DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
+                                  N00->getOperand(0)),
+                      DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
+                                  N10->getOperand(0))),
+          VT);
+    }
+  }
 
-  // Wait 'til after everything is legalized to try this. That way we have
+  // Wait till after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 6d2f75b..53aea45 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -10,9 +10,8 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    umaxv b0, v0.8b
 ; CHECK-NEXT:    fmov w9, s0
@@ -32,13 +31,10 @@ define i1 @unordered_floating_point_compare_on_v16f32(<16 x float> %a_vec) {
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    bic w0, w8, w9
@@ -52,30 +48,23 @@ define i1 @unordered_floating_point_compare_on_v16f32(<16 x float> %a_vec) {
 define i1 @unordered_floating_point_compare_on_v32f32(<32 x float> %a_vec) {
 ; CHECK-LABEL: unordered_floating_point_compare_on_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcmgt v7.4s, v7.4s, #0.0
-; CHECK-NEXT:    mov w9, #1
-; CHECK-NEXT:    fcmgt v6.4s, v6.4s, #0.0
-; CHECK-NEXT:    fcmgt v5.4s, v5.4s, #0.0
-; CHECK-NEXT:    fcmgt v4.4s, v4.4s, #0.0
 ; CHECK-NEXT:    fcmgt v3.4s, v3.4s, #0.0
+; CHECK-NEXT:    mov w9, #1
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mvn v7.16b, v7.16b
-; CHECK-NEXT:    mvn v6.16b, v6.16b
-; CHECK-NEXT:    mvn v5.16b, v5.16b
-; CHECK-NEXT:    mvn v4.16b, v4.16b
-; CHECK-NEXT:    mvn v3.16b, v3.16b
-; CHECK-NEXT:    mvn v2.16b, v2.16b
-; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    uzp1 v6.8h, v6.8h, v7.8h
-; CHECK-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-NEXT:    fcmgt v7.4s, v7.4s, #0.0
+; CHECK-NEXT:    fcmgt v6.4s, v6.4s, #0.0
+; CHECK-NEXT:    fcmgt v5.4s, v5.4s, #0.0
+; CHECK-NEXT:    fcmgt v4.4s, v4.4s, #0.0
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-NEXT:    uzp1 v6.8h, v6.8h, v7.8h
+; CHECK-NEXT:    uzp1 v1.8h, v4.8h, v5.8h
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v6.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    bic w0, w9, w8
diff --git a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
new file mode 100644
index 0000000..7fc83c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i16> @not_not_trunc_concat(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %r = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %r
+}
+
+; Chains of concat -> truncate -> negate should flatten out to a single negate.
+define <16 x i8> @not_not_trunc_concat_chain(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat_chain:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trna = trunc <4 x i32> %nota to <4 x i16>
+  %notb = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnb = trunc <4 x i32> %notb to <4 x i16>
+  %concat_a = shufflevector <4 x i16> %trna, <4 x i16> %trnb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %trun_concat_a = trunc <8 x i16> %concat_a to <8 x i8>
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %concat_b = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %trun_concat_b = trunc <8 x i16> %concat_b to <8 x i8>
+  %r = shufflevector <8 x i8> %trun_concat_a, <8 x i8> %trun_concat_b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %r
+}
+
+; The truncates have multiple uses, so the combine should not fire here; otherwise slightly worse code would be emitted.
+define <8 x i16> @not_not_trunc_concat_multiple_uses(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: not_not_trunc_concat_multiple_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    mvn v1.8b, v1.8b
+; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    mov v2.d[1], v1.d[0]
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    ret
+  %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trnx = trunc <4 x i32> %notx to <4 x i16>
+  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %trny = trunc <4 x i32> %noty to <4 x i16>
+  %concat = shufflevector <4 x i16> %trnx, <4 x i16> %trny, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %add = add <4 x i16> %trnx, %trny
+  %extend_add = shufflevector <4 x i16> %add, <4 x i16> %add, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r = add <8 x i16> %concat, %extend_add
+  ret <8 x i16> %r
+
+}
+
-- 
2.7.4