From 28de5f99bbe10d8982500abf5df24a80ab1bae79 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 16 Jan 2023 10:44:38 +0000
Subject: [PATCH] [AArch64] Sink to umull if we know top bits are zero.

This is an extension to the code for sinking splats to multiplies: if
we can detect that the top bits of an operand are known to be zero, we
can treat the instruction like a zext. The existing code was also
adjusted in the process to make it more precise about only sinking if
both operands are zext or sext. (An IR sketch of the newly-handled
pattern follows the patch.)

Differential Revision: https://reviews.llvm.org/D141275
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp   | 38 ++++++++++++++++++----
 .../CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 29 +++++++----------
 .../AArch64/sink-free-instructions.ll             |  8 ++---
 3 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 476a3ef..93aac68 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -14016,12 +14017,20 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Mul: {
-    bool IsProfitable = false;
+    int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
         continue;

+      if (match(&Op, m_SExt(m_Value()))) {
+        NumSExts++;
+        continue;
+      } else if (match(&Op, m_ZExt(m_Value()))) {
+        NumZExts++;
+        continue;
+      }
+
       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);

       // If the Shuffle is a splat and the operand is a zext/sext, sinking the
@@ -14031,11 +14040,14 @@ bool AArch64TargetLowering::shouldSinkOperands(
           match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
         Ops.push_back(&Shuffle->getOperandUse(0));
         Ops.push_back(&Op);
-        IsProfitable = true;
+        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
+          NumSExts++;
+        else
+          NumZExts++;
         continue;
       }

-      if (!Shuffle || !Shuffle->isZeroEltSplat())
+      if (!Shuffle)
         continue;

       Value *ShuffleOperand = Shuffle->getOperand(0);
@@ -14054,15 +14066,27 @@ bool AArch64TargetLowering::shouldSinkOperands(
         continue;

       unsigned Opcode = OperandInstr->getOpcode();
-      if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
-        continue;
+      if (Opcode == Instruction::SExt)
+        NumSExts++;
+      else if (Opcode == Instruction::ZExt)
+        NumZExts++;
+      else {
+        // If we find that the top bits are known 0, then we can sink and allow
+        // the backend to generate a umull.
+        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
+        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
+        const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
+        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
+          continue;
+        NumZExts++;
+      }

       Ops.push_back(&Shuffle->getOperandUse(0));
       Ops.push_back(&Op);
-      IsProfitable = true;
     }

-    return IsProfitable;
+    // It is only profitable to sink if we found two of the same type of extend.
+    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
   default:
     return false;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 827f0ea..49a1552 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -427,13 +427,12 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_4: // %vector.ph
-; CHECK-NEXT:    dup v2.8b, w9
 ; CHECK-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    add x8, x0, #8
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    mov x12, x11
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    dup v2.8h, w9
 ; CHECK-NEXT:  .LBB5_5: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp d3, d4, [x8, #-8]
@@ -704,9 +703,8 @@ define void @matrix_mul_unsigned_and(i32 %N, i32* nocapture %C, i16* nocapture r
 ; CHECK:       // %bb.0: // %vector.header
 ; CHECK-NEXT:    and w8, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:  .LBB10_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
@@ -767,27 +765,24 @@ for.end12: ; preds = %vector.body
 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
 ; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w8, w3, #0xffff
+; CHECK-NEXT:    and w9, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    dup v0.8h, w9
 ; CHECK-NEXT:  .LBB11_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
+; CHECK-NEXT:    add x10, x1, w0, uxtw #2
 ; CHECK-NEXT:    subs x8, x8, #16
+; CHECK-NEXT:    add w0, w0, #16
 ; CHECK-NEXT:    ldr q1, [x9]
 ; CHECK-NEXT:    ldur q2, [x9, #8]
-; CHECK-NEXT:    add x9, x1, w0, uxtw #2
-; CHECK-NEXT:    add w0, w0, #16
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    stp q1, q3, [x9]
-; CHECK-NEXT:    stp q2, q4, [x9, #32]
+; CHECK-NEXT:    stp q1, q3, [x10]
+; CHECK-NEXT:    stp q2, q4, [x10, #32]
 ; CHECK-NEXT:    b.ne .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %for.end12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
index acba833..f29054b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -313,12 +313,12 @@ define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {
 ; CHECK-NEXT:  for.cond4.preheader.lr.ph:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0
 ; CHECK-NEXT:    [[CONV25:%.*]] = sext i16 [[E:%.*]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT143:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]]
 ; CHECK:       for.cond4.preheader.us.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT143]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 ; CHECK:       for.cond4.preheader.preheader:
 ; CHECK-NEXT:    ret <4 x i32> zeroinitializer
 ;
--
2.7.4
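
IR sketch of the newly-handled pattern (hypothetical, not from the patch or
its tests; the function name and values are invented for illustration, in the
spirit of the matrix_mul_unsigned_and tests above). The scalar feeding the
splat is masked with 0xffff, so MaskedValueIsZero can prove the top half of
each 32-bit lane is zero. shouldSinkOperands then counts the splat as a second
zext-like operand (NumZExts == 2) and reports the insertelement/shufflevector
pair for sinking into the mul's block, where instruction selection can use
umull instead of a full 32-bit multiply:

define <4 x i32> @umull_known_zero_top_bits(i32 %val, <4 x i16> %a) {
entry:
  ; Masking makes the top 16 bits of %m provably zero, so the splat of %m
  ; can be treated like a zext even though no zext instruction exists.
  %m = and i32 %val, 65535
  %ins = insertelement <4 x i32> poison, i32 %m, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %body

body:
  ; One operand is a real zext; the splat is the second "zext-like" operand,
  ; so both extends are visible to ISel once the splat is sunk here.
  %wide = zext <4 x i16> %a to <4 x i32>
  %mul = mul <4 x i32> %splat, %wide
  ret <4 x i32> %mul
}

Before this change the splat stayed in %entry, since it is neither a zext nor
a sext, and the loop body needed extra xtn/ext shuffling around the umulls, as
the old matrix_mul_unsigned_and_double CHECK lines show; with the splat sunk,
the loop selects umull/umull2 directly.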