[CGP,AArch64] Replace zexts with shuffle that can be lowered using tbl.
authorFlorian Hahn <flo@fhahn.com>
Thu, 15 Sep 2022 18:18:12 +0000 (19:18 +0100)
committerFlorian Hahn <flo@fhahn.com>
Thu, 15 Sep 2022 18:18:13 +0000 (19:18 +0100)
This patch extends CodeGenPrepare to lower zext v16i8 -> v16i32 in loops
using a wide shuffle creating a v64i8 vector, selecting groups of 3
zero elements and an element from the input.

This is profitable on AArch64 where such shuffles can be lowered to tbl
instructions, but only in loops, because it requires materializing 4
masks, which can be done in the loop preheader.

This is the only reason the transform is part of CGP. If there's a
better alternative I missed, please let me know. The same goes for the
shouldReplaceZExtWithShuffle hook which guards this. I am not sure if
this transform will be beneficial on other targets, but it seems like
there is no other convenient way.

This improves the generated code for loops like the one below in
combination with D96522.

    int foo(uint8_t *p, int N) {
      unsigned long long sum = 0;
      for (int i = 0; i < N ; i++, p++) {
        unsigned int v = *p;
        sum += (v < 127) ? v : 256 - v;
      }
      return sum;
    }

https://clang.godbolt.org/z/Wco866MjY

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D120571

llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/CodeGenPrepare.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/vselect-ext.ll
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll

index 3495e3c..7f6580d 100644 (file)
@@ -67,6 +67,7 @@ class Constant;
 class FastISel;
 class FunctionLoweringInfo;
 class GlobalValue;
+class Loop;
 class GISelKnownBits;
 class IntrinsicInst;
 class IRBuilderBase;
@@ -2798,6 +2799,13 @@ public:
     return false;
   }
 
+  /// Try to optimize conversion \p I (like zext, trunc, fptoui, uitofp) for
+  /// the target; \p L is its loop, if any. Returns true if \p I was changed.
+  virtual bool optimizeExtendOrTruncateConversion(Instruction *I,
+                                                  Loop *L) const {
+    return false;
+  }
+
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met
index 11f284b..9e77f79 100644 (file)
@@ -8055,6 +8055,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
           TargetLowering::TypeExpandInteger) {
         return SinkCast(CI);
       } else {
+        if (TLI->optimizeExtendOrTruncateConversion(
+                I, LI->getLoopFor(I->getParent())))
+          return true;
+
         bool MadeChange = optimizeExt(I);
         return MadeChange | optimizeExtUses(I);
       }
index f1c48f9..0de2645 100644 (file)
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -13183,6 +13184,60 @@ bool AArch64TargetLowering::shouldSinkOperands(
   return false;
 }
 
+/// Replace \p ZExt, a zext of a fixed vector of i8 to 4-byte lanes, with a
+/// byte shuffle of its operand and a zero, bitcast to the original type, then
+/// erase \p ZExt. \p IsLittleEndian picks the byte that gets the source value.
+static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
+  Value *Op = ZExt->getOperand(0);
+  // Both types must be fixed vectors here; cast<> asserts instead of yielding
+  // a null pointer that would be dereferenced unchecked below.
+  auto *SrcTy = cast<FixedVectorType>(Op->getType());
+  auto *DstTy = cast<FixedVectorType>(ZExt->getType());
+  unsigned NumElts = SrcTy->getNumElements();
+  IRBuilder<> Builder(ZExt);
+  SmallVector<int> Mask(4 * NumElts, NumElts);
+  // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
+  // replace the original ZExt. Index NumElts is lane 0 of the second shuffle
+  // operand (the zero). This can later be lowered to a set of tbl instructions.
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask[i * 4 + (IsLittleEndian ? 0 : 3)] = i;
+
+  auto *FirstEltZero = Builder.CreateInsertElement(
+      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
+  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
+  Result = Builder.CreateBitCast(Result, DstTy);
+  ZExt->replaceAllUsesWith(Result);
+  ZExt->eraseFromParent();
+}
+
+bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
+                                                               Loop *L) const {
+  // The tbl-based lowering needs constant index vectors, which cost code size
+  // and extra loads. Only convert instructions in a loop header (a loop block
+  // that is guaranteed to execute) of a function not optimized for size.
+  auto *BB = I->getParent();
+  if (!L || L->getHeader() != BB)
+    return false;
+  Function *Fn = BB->getParent();
+  if (Fn->hasMinSize() || Fn->hasOptSize())
+    return false;
+
+  // Only 'zext <(8|16) x i8> %x to <(8|16) x i32>' is handled. It becomes a
+  // shuffle that can be lowered to either 2 or 4 tbl instructions inserting
+  // the original i8 elements into i32 lanes.
+  auto *Ext = dyn_cast<ZExtInst>(I);
+  auto *FromTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
+  auto *ToTy = dyn_cast<FixedVectorType>(I->getType());
+  if (!Ext || !FromTy || !ToTy)
+    return false;
+  unsigned Lanes = FromTy->getNumElements();
+  if ((Lanes != 8 && Lanes != 16) ||
+      !FromTy->getElementType()->isIntegerTy(8) ||
+      !ToTy->getElementType()->isIntegerTy(32))
+    return false;
+
+  createTblShuffleForZExt(Ext, Subtarget->isLittleEndian());
+  return true;
+}
+
 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                           Align &RequiredAligment) const {
   if (!LoadedType.isSimple() ||
index a5552ca..7b1b0e5 100644 (file)
@@ -606,6 +606,9 @@ public:
   bool shouldSinkOperands(Instruction *I,
                           SmallVectorImpl<Use *> &Ops) const override;
 
+  bool optimizeExtendOrTruncateConversion(Instruction *I,
+                                          Loop *L) const override;
+
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
 
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
index fa0db82..6f16bc0 100644 (file)
@@ -573,35 +573,53 @@ entry:
 define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x9, lCPI24_0@PAGE
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    adrp x10, lCPI24_1@PAGE
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    adrp x11, lCPI24_2@PAGE
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    adrp x12, lCPI24_3@PAGE
+; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:    ldr q0, [x9, lCPI24_0@PAGEOFF]
+; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:    ldr q1, [x10, lCPI24_1@PAGEOFF]
+; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:    ldr q3, [x11, lCPI24_2@PAGEOFF]
+; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:    ldr q4, [x12, lCPI24_3@PAGEOFF]
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q1, [x0, x8]
+; CHECK-NEXT:    ldr q5, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v2, v1, v0
-; CHECK-NEXT:    ushll2.8h v3, v1, #0
-; CHECK-NEXT:    sshll2.8h v4, v2, #0
-; CHECK-NEXT:    ushll2.4s v5, v3, #0
-; CHECK-NEXT:    ushll.4s v3, v3, #0
-; CHECK-NEXT:    sshll2.4s v6, v4, #0
-; CHECK-NEXT:    sshll.4s v4, v4, #0
-; CHECK-NEXT:    ushll.8h v1, v1, #0
-; CHECK-NEXT:    sshll.8h v2, v2, #0
+; CHECK-NEXT:    cmgt.16b v6, v5, v2
+; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
+; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
+; CHECK-NEXT:    sshll2.8h v18, v6, #0
+; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
+; CHECK-NEXT:    sshll2.4s v19, v18, #0
+; CHECK-NEXT:    sshll.4s v18, v18, #0
+; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
+; CHECK-NEXT:    sshll.8h v6, v6, #0
+; CHECK-NEXT:    and.16b v7, v7, v19
+; CHECK-NEXT:    and.16b v16, v16, v18
+; CHECK-NEXT:    stp q16, q7, [x1, #32]
+; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    sshll.4s v6, v6, #0
+; CHECK-NEXT:    and.16b v7, v17, v7
 ; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    and.16b v3, v3, v4
-; CHECK-NEXT:    stp q3, q5, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v4, v2, #0
-; CHECK-NEXT:    sshll.4s v2, v2, #0
-; CHECK-NEXT:    ushll2.4s v3, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    and.16b v3, v3, v4
-; CHECK-NEXT:    and.16b v1, v1, v2
-; CHECK-NEXT:    stp q1, q3, [x1], #64
+; CHECK-NEXT:    stp q5, q7, [x1], #64
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
 entry:
   br label %loop
 
@@ -627,23 +645,23 @@ exit:
 define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x9, lCPI25_0@PAGE
-; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    adrp x10, lCPI25_1@PAGE
-; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x11, lCPI25_2@PAGE
-; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    adrp x12, lCPI25_3@PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI25_0@PAGEOFF]
-; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI25_1@PAGEOFF]
-; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI25_2@PAGEOFF]
-; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI25_3@PAGEOFF]
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -670,10 +688,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
+; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
 entry:
   br label %loop
 
@@ -700,23 +718,23 @@ exit:
 define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x9, lCPI26_0@PAGE
-; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    adrp x10, lCPI26_1@PAGE
-; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:  Lloh20:
 ; CHECK-NEXT:    adrp x11, lCPI26_2@PAGE
-; CHECK-NEXT:  Lloh13:
+; CHECK-NEXT:  Lloh21:
 ; CHECK-NEXT:    adrp x12, lCPI26_3@PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI26_0@PAGEOFF]
-; CHECK-NEXT:  Lloh15:
+; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI26_1@PAGEOFF]
-; CHECK-NEXT:  Lloh16:
+; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI26_2@PAGEOFF]
-; CHECK-NEXT:  Lloh17:
+; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI26_3@PAGEOFF]
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -743,10 +761,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    b.ne LBB26_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh25
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh23
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh22
 entry:
   br label %loop
 
index 0daf371..6d2615e 100644 (file)
 ; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
 
+; CHECK-LABEL: lCPI0_0:
+; CHECK-NEXT:    .byte   0                               ; 0x0
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   1                               ; 0x1
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   2                               ; 0x2
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   3                               ; 0x3
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_1:
+; CHECK-NEXT:    .byte   4                               ; 0x4
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   5                               ; 0x5
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   6                               ; 0x6
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   7                               ; 0x7
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_2:
+; CHECK-NEXT:    .byte   8                               ; 0x8
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   9                               ; 0x9
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   10                              ; 0xa
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   11                              ; 0xb
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:lCPI0_3:
+; CHECK-NEXT:    .byte   12                              ; 0xc
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   13                              ; 0xd
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   14                              ; 0xe
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   15                              ; 0xf
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+; CHECK-NEXT:    .byte   255                             ; 0xff
+
+; CHECK-BE: .LCPI0_0:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   0                               // 0x0
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   1                               // 0x1
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   2                               // 0x2
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   3                               // 0x3
+; CHECK-BE-NEXT: .LCPI0_1:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   4                               // 0x4
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   5                               // 0x5
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   6                               // 0x6
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   7                               // 0x7
+; CHECK-BE-NEXT: .LCPI0_2:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   8                               // 0x8
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   9                               // 0x9
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   10                              // 0xa
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   11                              // 0xb
+; CHECK-BE-NEXT: .LCPI0_3:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   12                              // 0xc
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   13                              // 0xd
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   14                              // 0xe
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   15                              // 0xf
+
 ; It's profitable to convert the zext to a shuffle, which in turn will be
 ; lowered to 4 tbl instructions. The masks are materialized outside the loop.
 define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    adrp x10, lCPI0_1@PAGE
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x11, lCPI0_2@PAGE
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    adrp x12, lCPI0_3@PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    ldr q1, [x10, lCPI0_1@PAGEOFF]
+; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:    ldr q2, [x11, lCPI0_2@PAGEOFF]
+; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:    ldr q3, [x12, lCPI0_3@PAGEOFF]
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ldr q4, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ushll2.8h v1, v0, #0
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
-; CHECK-NEXT:    stp q0, q3, [x1], #64
+; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v6, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
+; CHECK-NEXT:    stp q6, q5, [x1, #32]
+; CHECK-NEXT:    stp q4, q7, [x1], #64
 ; CHECK-NEXT:    b.ne LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh1, Lloh5
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh4
 ;
 ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_1
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_1
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_2
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_2
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI0_3
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_3
+; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB0_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -34,20 +202,18 @@ define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
-; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
+; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v0.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v1.16b
+; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v6.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v7.16b }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB0_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -393,39 +559,123 @@ exit:
   ret void
 }
 
+; CHECK-LABEL: lCPI6_0:
+; CHECK-NEXT:     .byte   0                               ; 0x0
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   1                               ; 0x1
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   2                               ; 0x2
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   3                               ; 0x3
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT: lCPI6_1:
+; CHECK-NEXT:     .byte   4                               ; 0x4
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   5                               ; 0x5
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   6                               ; 0x6
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   7                               ; 0x7
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-NEXT:     .byte   255                             ; 0xff
+
+; CHECK-BE:       .LCPI6_0:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   0                               // 0x0
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   1                               // 0x1
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   2                               // 0x2
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   3                               // 0x3
+; CHECK-BE-NEXT: .LCPI6_1:
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   4                               // 0x4
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   5                               // 0x5
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   6                               // 0x6
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   255                             // 0xff
+; CHECK-BE-NEXT:       .byte   7                               // 0x7
+
 define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:    adrp x9, lCPI6_0@PAGE
+; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:    adrp x10, lCPI6_1@PAGE
 ; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:    ldr q0, [x9, lCPI6_0@PAGEOFF]
+; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:    ldr q1, [x10, lCPI6_1@PAGEOFF]
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr d0, [x0, x8]
+; CHECK-NEXT:    ldr d2, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v1, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    stp q0, q1, [x1], #64
+; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
+; CHECK-NEXT:    stp q2, q3, [x1], #64
 ; CHECK-NEXT:    b.ne LBB6_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh9, Lloh11
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh10
 ;
 ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    adrp x8, .LCPI6_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    adrp x8, .LCPI6_1
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_1
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB6_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
-; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    ld1 { v2.8b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT:    tbl v3.16b, { v2.16b }, v0.16b
+; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v1.16b
+; CHECK-BE-NEXT:    st1 { v3.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v2.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB6_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
index 77dfcd2..7de6718 100644 (file)
@@ -15,10 +15,11 @@ define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
-; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[TMP1]], <16 x i32>* [[DST_GEP_CAST]], align 64
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]