[SVE][CodeGen] Add DAG combines for s/zext_masked_gather

author Kerry McLaughlin <kerry.mclaughlin@arm.com>

Wed, 9 Dec 2020 11:21:51 +0000 (11:21 +0000)

committer Kerry McLaughlin <kerry.mclaughlin@arm.com>

Wed, 9 Dec 2020 11:53:19 +0000 (11:53 +0000)
author Kerry McLaughlin <kerry.mclaughlin@arm.com>
Wed, 9 Dec 2020 11:21:51 +0000 (11:21 +0000)
committer Kerry McLaughlin <kerry.mclaughlin@arm.com>
Wed, 9 Dec 2020 11:53:19 +0000 (11:53 +0000)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index ce4ee89..212e0a2 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -932,6 +932,33 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const {
    return false;
  }
  
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
+  if (!ScalarTy.isSimple())
+    return false;
+
+  uint64_t MaskForTy = 0ULL;
+  switch (ScalarTy.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    MaskForTy = 0xFFULL;
+    break;
+  case MVT::i16:
+    MaskForTy = 0xFFFFULL;
+    break;
+  case MVT::i32:
+    MaskForTy = 0xFFFFFFFFULL;
+    break;
+  default:
+    return false;
+    break;
+  }
+
+  APInt Val;
+  if (ISD::isConstantSplatVector(N, Val))
+    return Val.getLimitedValue() == MaskForTy;
+
+  return false;
+}
+
  // Returns the SDNode if it is a constant float BuildVector
  // or constant float.
  static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
@@ -5622,6 +5649,28 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
      }
    }
  
+  // fold (and (masked_gather x)) -> (zext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    EVT MemVT = GN0->getMemoryVT();
+    EVT ScalarVT = MemVT.getScalarType();
+
+    if (SDValue(GN0, 0).hasOneUse() &&
+        isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(),   GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
+
+      SDValue ZExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+
+      CombineTo(N, ZExtLoad);
+      AddToWorklist(ZExtLoad.getNode());
+      // Avoid recheck of N.
+      return SDValue(N, 0);
+    }
+  }
+
    // fold (and (load x), 255) -> (zextload x, i8)
    // fold (and (extload x, i16), 255) -> (zextload x, i8)
    // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
@@ -11597,6 +11646,25 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
      }
    }
  
+  // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    if (SDValue(GN0, 0).hasOneUse() &&
+        ExtVT == GN0->getMemoryVT() &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(),   GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
+
+      SDValue ExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
+
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      AddToWorklist(ExtLoad.getNode());
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
+    }
+  }
+
    // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
    if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
      if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

index 20f5ded..5d9c66e 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3836,6 +3836,26 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
    return AddrModes.find(Key)->second;
  }
  
+unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("unimplemented opcode");
+    return Opcode;
+  case AArch64ISD::GLD1_MERGE_ZERO:
+    return AArch64ISD::GLD1S_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+  }
+}
+
  bool getGatherScatterIndexIsExtended(SDValue Index) {
    unsigned Opcode = Index.getOpcode();
    if (Opcode == ISD::SIGN_EXTEND_INREG)
@@ -3865,6 +3885,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
    SDValue PassThru = MGT->getPassThru();
    SDValue Mask = MGT->getMask();
    SDValue BasePtr = MGT->getBasePtr();
+  ISD::LoadExtType ExtTy = MGT->getExtensionType();
  
    ISD::MemIndexType IndexType = MGT->getIndexType();
    bool IsScaled =
@@ -3874,6 +3895,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
    bool IdxNeedsExtend =
        getGatherScatterIndexIsExtended(Index) ||
        Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+  bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
  
    EVT VT = PassThru.getSimpleValueType();
    EVT MemVT = MGT->getMemoryVT();
@@ -3900,9 +3922,12 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
    if (getGatherScatterIndexIsExtended(Index))
      Index = Index.getOperand(0);
  
+  unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+  if (ResNeedsSignExtend)
+    Opcode = getSignExtendedGatherOpcode(Opcode);
+
    SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
-  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
-                     VTs, Ops);
+  return DAG.getNode(Opcode, DL, VTs, Ops);
  }
  
  SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll

index 32dca0d..e6b89b0 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -72,9 +71,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -85,9 +82,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -103,7 +98,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
  ; CHECK-LABEL: masked_gather_nxv4i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
    %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
@@ -144,9 +138,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
  define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
    %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll

index 1fc048a..2d4ce50 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
  ; CHECK-LABEL: masked_gather_nxv2i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -34,7 +33,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -90,9 +88,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
  define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i8:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -103,9 +99,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -117,9 +111,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -136,7 +128,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
  ; CHECK-LABEL: masked_gather_nxv4i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
    %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@@ -148,7 +139,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
  ; CHECK-LABEL: masked_gather_nxv4i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
    %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
@@ -193,9 +183,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
  define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i8:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
    %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@@ -206,9 +194,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
  define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
    %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll

index ada49b7..41f1eb4 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@@ -78,9 +77,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@@ -92,9 +89,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@@ -111,7 +106,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
  ; CHECK-LABEL: masked_gather_nxv4i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
@@ -156,9 +150,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
  define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll

index 61b8e3e..51ab73c 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -1,5 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
  ; CHECK-LABEL: masked_gather_nxv2i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -36,7 +35,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -97,9 +95,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
  define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i8:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -111,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -126,9 +120,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -146,7 +138,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
  ; CHECK-LABEL: masked_gather_nxv4i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -159,7 +150,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
  ; CHECK-LABEL: masked_gather_nxv4i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -208,9 +198,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
  define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i8:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -222,9 +210,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
  define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv4i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
  ; CHECK-NEXT:    ret
    %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll

index 197ed69..15dfcc6 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -1,11 +1,11 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64>
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -68,9 +67,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -81,9 +78,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll

index 3f4f54c..3320b88 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -1,11 +1,11 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
  
  define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_gather_nxv2i8:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %
  ; CHECK-LABEL: masked_gather_nxv2i16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -30,7 +29,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -86,9 +84,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64
  define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i8:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
  ; CHECK-NEXT:    ret
    %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -99,9 +95,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %
  define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d]
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -113,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64>
  define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_sgather_nxv2i32:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d]
  ; CHECK-NEXT:    ret
    %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
    %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

index 962ba07..076edc1 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -1,5 +1,46 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
+
+; Test for multiple uses of the mgather where the s/zext should not be combined
+
+define <vscale x 2 x i64> @masked_sgather_sext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z2.d, p0/m, z0.d
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %data.sext = sext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+  %add = add <vscale x 2 x i8> %data, %vals
+  %add.sext = sext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+  %mul = mul <vscale x 2 x i64> %data.sext, %add.sext
+  ret <vscale x 2 x i64> %mul
+}
+
+define <vscale x 2 x i64> @masked_sgather_zext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add z1.d, z0.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: and z1.d, z1.d, #0xff
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %data.zext = zext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+  %add = add <vscale x 2 x i8> %data, %vals
+  %add.zext = zext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+  %mul = mul <vscale x 2 x i64> %data.zext, %add.zext
+  ret <vscale x 2 x i64> %mul
+}
  
  ; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
  
@@ -7,7 +48,7 @@
  define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
  ; CHECK-LABEL: masked_gather_nxv2i32:
  ; CHECK-DAG: mov x8, xzr
-; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
  ; CHECK:     ret
    %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
    ret <vscale x 2 x i32> %data
@@ -41,8 +82,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vsca
  ; CHECK-NEXT:    mov x8, xzr
  ; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
  ; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT:    ld1b { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ld1sb { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x8, z0.d]
  ; CHECK-NEXT:    ptrue p0.s
  ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
  ; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
author	Kerry McLaughlin <kerry.mclaughlin@arm.com>
	Wed, 9 Dec 2020 11:21:51 +0000 (11:21 +0000)
committer	Kerry McLaughlin <kerry.mclaughlin@arm.com>
	Wed, 9 Dec 2020 11:53:19 +0000 (11:53 +0000)
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll		patch \| blob \| history