From 90ecb862a003d581136842dcdc213315727d50e2 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Tue, 16 Mar 2021 11:53:43 +0000
Subject: [PATCH] [AArch64] Rewrite (add, csel) to cinc

Don't rewrite an add instruction with 2 SET_CC operands into a csel
instruction. The total instruction sequence uses an extra instruction and
register. Preventing this allows us to match a `(add, csel)` pattern and
rewrite this into a `cinc`.

Differential Revision: https://reviews.llvm.org/D98704
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp |  7 +++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td     |  5 +++
 llvm/test/CodeGen/AArch64/arm64-csel.ll         | 41 +++++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/half.ll               | 12 ++++----
 4 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3c823f5..e3c928e1b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13190,6 +13190,13 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
   SDValue RHS = Op->getOperand(1);
   SetCCInfoAndKind InfoAndKind;
 
+  // If both operands are a SET_CC, then we don't want to perform this
+  // folding and create another csel as this results in more instructions
+  // (and higher register usage).
+  if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
+      isSetCCOrZExtSetCC(RHS, InfoAndKind))
+    return SDValue();
+
   // If neither operand is a SET_CC, give up.
   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
     std::swap(LHS, RHS);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d5dd0ae..338963f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2162,6 +2162,11 @@ def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
 def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
           (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
 
+def : Pat<(add GPR32:$val, (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV)),
+          (CSINCWr GPR32:$val, GPR32:$val, (i32 imm:$cc))>;
+def : Pat<(add GPR64:$val, (zext (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV))),
+          (CSINCXr GPR64:$val, GPR64:$val, (i32 imm:$cc))>;
+
 // The inverse of the condition code from the alias instruction is what is used
 // in the aliased instruction. The parser all ready inverts the condition code
 // for these aliases.
diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
index f031710..44e951e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-csel.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll
@@ -269,3 +269,44 @@ define i64 @foo23(i64 %x) {
   %res = select i1 %cmp, i64 1, i64 6
   ret i64 %res
 }
+
+define i16 @foo24(i8* nocapture readonly %A, i8* nocapture readonly %B) {
+; CHECK-LABEL: foo24:
+; CHECK:       ldrb    w[[W8:[0-9]+]], [x1]
+; CHECK-NEXT:  ldrb    w[[W9:[0-9]+]], [x0]
+; CHECK-NEXT:  cmp     w[[W8]], #33
+; CHECK-NEXT:  cset    w[[W8]], hi
+; CHECK-NEXT:  cmp     w[[W9]], #3
+; CHECK-NEXT:  cinc    w0, w[[W8]], hi
+; CHECK-NEXT:  ret
+entry:
+  %0 = load i8, i8* %A, align 1
+  %cmp = icmp ugt i8 %0, 3
+  %conv1 = zext i1 %cmp to i16
+  %1 = load i8, i8* %B, align 1
+  %cmp4 = icmp ugt i8 %1, 33
+  %conv5 = zext i1 %cmp4 to i16
+  %add = add nuw nsw i16 %conv5, %conv1
+  ret i16 %add
+}
+
+define i64 @foo25(i64* nocapture readonly %A, i64* nocapture readonly %B) {
+; CHECK-LABEL: foo25:
+; CHECK:       ldr    x[[X8:[0-9]+]], [x1]
+; CHECK-NEXT:  ldr    x[[X9:[0-9]+]], [x0]
+; CHECK-NEXT:  cmp    x[[X8]], #33
+; CHECK-NEXT:  cset   w[[W8]], hi
+; CHECK-NEXT:  cmp    x[[X9]], #3
+; CHECK-NEXT:  cinc   x0, x[[X8]], hi
+; CHECK-NEXT:  ret
+entry:
+  %0 = load i64, i64* %A, align 1
+  %cmp = icmp ugt i64 %0, 3
+  %conv1 = zext i1 %cmp to i64
+  %1 = load i64, i64* %B, align 1
+  %cmp4 = icmp ugt i64 %1, 33
+  %conv5 = zext i1 %cmp4 to i64
+  %add = add nuw nsw i64 %conv5, %conv1
+  ret i64 %add
+}
+
diff --git a/llvm/test/CodeGen/AArch64/half.ll b/llvm/test/CodeGen/AArch64/half.ll
index b815c53..ab64cc0 100644
--- a/llvm/test/CodeGen/AArch64/half.ll
+++ b/llvm/test/CodeGen/AArch64/half.ll
@@ -107,12 +107,12 @@ define i16 @test_fccmp(i1 %a, i16 %in) {
 ; CHECK-NEXT:    movk w9, #15428, lsl #16
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    cset w8, pl
-; CHECK-NEXT:    fccmp s0, s1, #8, pl
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    csinc w9, w9, wzr, mi
-; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    mov w10, #4
+; CHECK-NEXT:    fccmp s0, s2, #8, pl
+; CHECK-NEXT:    csinc w8, w10, wzr, mi
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cinc w0, w8, pl
 ; CHECK-NEXT:    ret
   %f16 = bitcast i16 %in to half
   %cmp0 = fcmp ogt half 0xH3333, %f16
-- 
2.7.4