This patch adds the following DAGCombines, which apply if isVectorLoadExtDesirable() returns true:
- fold (and (masked_gather x)) -> (zext_masked_gather x)
- fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
LowerMGATHER has also been updated to fetch the LoadExtType associated with the
gather and to use this value to select the correct masked gather opcode.
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D92230
return false;
}
+/// Returns true if \p N is a constant splat vector whose splatted value equals
+/// the all-ones mask for \p ScalarTy: 0xFF for i8, 0xFFFF for i16 and
+/// 0xFFFFFFFF for i32. Returns false for any other scalar type (including
+/// non-simple types) and for nodes that are not constant splats.
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
+  // Only simple value types can be mapped onto one of the masks below.
+  if (!ScalarTy.isSimple())
+    return false;
+
+  uint64_t MaskForTy = 0ULL;
+  switch (ScalarTy.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    MaskForTy = 0xFFULL;
+    break;
+  case MVT::i16:
+    MaskForTy = 0xFFFFULL;
+    break;
+  case MVT::i32:
+    MaskForTy = 0xFFFFFFFFULL;
+    break;
+  default:
+    // No mask is defined for other element types.
+    return false;
+  }
+
+  // The splat value is compared as a (saturated) 64-bit quantity, so the
+  // splat must be exactly the mask with no additional bits set.
+  APInt SplatVal;
+  return ISD::isConstantSplatVector(N, SplatVal) &&
+         SplatVal.getLimitedValue() == MaskForTy;
+}
+
// Returns the SDNode if it is a constant float BuildVector
// or constant float.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
}
}
+ // fold (and (masked_gather x)) -> (zext_masked_gather x)
+ if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+ EVT MemVT = GN0->getMemoryVT();
+ EVT ScalarVT = MemVT.getScalarType();
+
+ if (SDValue(GN0, 0).hasOneUse() &&
+ isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
+ TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+ SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+ GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+ SDValue ZExtLoad = DAG.getMaskedGather(
+ DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
+ GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+
+ CombineTo(N, ZExtLoad);
+ AddToWorklist(ZExtLoad.getNode());
+ // Avoid recheck of N.
+ return SDValue(N, 0);
+ }
+ }
+
// fold (and (load x), 255) -> (zextload x, i8)
// fold (and (extload x, i16), 255) -> (zextload x, i8)
// fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
}
}
+ // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
+ if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+ if (SDValue(GN0, 0).hasOneUse() &&
+ ExtVT == GN0->getMemoryVT() &&
+ TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+ SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+ GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+ SDValue ExtLoad = DAG.getMaskedGather(
+ DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
+ GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
+
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ AddToWorklist(ExtLoad.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
return AddrModes.find(Key)->second;
}
+/// Maps a GLD1*_MERGE_ZERO gather opcode to the matching GLD1S*_MERGE_ZERO
+/// opcode (same addressing-mode variant: plain, UXTW, SXTW, and their SCALED
+/// forms). Any opcode outside that set is a programming error and hits
+/// llvm_unreachable.
+unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    // No fallback value: callers must only pass one of the opcodes below.
+    llvm_unreachable("unimplemented opcode");
+  case AArch64ISD::GLD1_MERGE_ZERO:
+    return AArch64ISD::GLD1S_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+  }
+}
+
bool getGatherScatterIndexIsExtended(SDValue Index) {
unsigned Opcode = Index.getOpcode();
if (Opcode == ISD::SIGN_EXTEND_INREG)
SDValue PassThru = MGT->getPassThru();
SDValue Mask = MGT->getMask();
SDValue BasePtr = MGT->getBasePtr();
+ ISD::LoadExtType ExtTy = MGT->getExtensionType();
ISD::MemIndexType IndexType = MGT->getIndexType();
bool IsScaled =
bool IdxNeedsExtend =
getGatherScatterIndexIsExtended(Index) ||
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+ bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
EVT VT = PassThru.getSimpleValueType();
EVT MemVT = MGT->getMemoryVT();
if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);
+ unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+ if (ResNeedsSignExtend)
+ Opcode = getSignExtendedGatherOpcode(Opcode);
+
SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
- return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
- VTs, Ops);
+ return DAG.getNode(Opcode, DL, VTs, Ops);
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
+
+; Test for multiple uses of the mgather where the s/zext should not be combined
+
+define <vscale x 2 x i64> @masked_sgather_sext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z2.d, p0/m, z0.d
+; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %data.sext = sext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+ %add = add <vscale x 2 x i8> %data, %vals
+ %add.sext = sext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+ %mul = mul <vscale x 2 x i64> %data.sext, %add.sext
+ ret <vscale x 2 x i64> %mul
+}
+
+define <vscale x 2 x i64> @masked_sgather_zext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add z1.d, z0.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: and z1.d, z1.d, #0xff
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %data.zext = zext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+ %add = add <vscale x 2 x i8> %data, %vals
+ %add.zext = zext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+ %mul = mul <vscale x 2 x i64> %data.zext, %add.zext
+ ret <vscale x 2 x i64> %mul
+}
; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK-DAG: mov x8, xzr
-; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
; CHECK: ret
%data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
ret <vscale x 2 x i32> %data
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1b { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1sb { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s