From d4a6bf4d1a915d3b0566f38e13ea90b597f6cfe3 Mon Sep 17 00:00:00 2001
From: Peter Waller
Date: Thu, 20 Jan 2022 12:01:23 +0000
Subject: [PATCH] Revert "[AArch64][SVE][VLS] Move extends into arguments of
 comparisons"

This reverts commit db04d3e30b3878ae39ef64eb0b0a1538644c7f6a, which
causes a buildbot failure.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp |  39 ----
 .../AArch64/sve-fixed-length-masked-loads.ll    | 196 ++++++++++++++-------
 2 files changed, 130 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 450ff60..8f85c93 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15338,40 +15338,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
   return SDValue();
 }
 
-static bool isCheapToExtend(const SDValue &N) {
-  unsigned OC = N->getOpcode();
-  return OC == ISD::LOAD || OC == ISD::MLOAD ||
-         ISD::isConstantSplatVectorAllZeros(N.getNode());
-}
-
-static SDValue
-performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                              SelectionDAG &DAG) {
-  // If we have (sext (setcc A B)) and A and B are cheap to extend,
-  // we can move the sext into the arguments and have the same result. For
-  // example, if A and B are both loads, we can make those extending loads and
-  // avoid an extra instruction. This pattern appears often in VLS code
-  // generation where the inputs to the setcc have a different size to the
-  // instruction that wants to use the result of the setcc.
-  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
-         N->getOperand(0)->getOpcode() == ISD::SETCC);
-  const SDValue SetCC = N->getOperand(0);
-
-  if (isCheapToExtend(SetCC.getOperand(0)) &&
-      isCheapToExtend(SetCC.getOperand(1))) {
-    const SDValue Ext1 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
-                                     N->getValueType(0), SetCC.getOperand(0));
-    const SDValue Ext2 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
-                                     N->getValueType(0), SetCC.getOperand(1));
-
-    return DAG.getSetCC(
-        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
-        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
-  }
-
-  return SDValue();
-}
-
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -15390,11 +15356,6 @@ static SDValue performExtendCombine(SDNode *N,
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
   }
-
-  if (N->getOpcode() == ISD::SIGN_EXTEND &&
-      N->getOperand(0)->getOpcode() == ISD::SETCC)
-    return performSignExtendSetCCCombine(N, DCI, DAG);
-
   return SDValue();
 }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 6bb85a8..d3ad7df 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -265,10 +265,12 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
 define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -281,9 +283,12 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -297,9 +302,13 @@ define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr d0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -313,10 +322,12 @@ define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -329,9 +340,12 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -345,10 +359,12 @@ define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -361,10 +377,12 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -377,9 +395,12 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -393,9 +414,13 @@ define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr d0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -409,10 +434,12 @@ define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -425,9 +452,12 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -441,10 +471,12 @@ define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
 ; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -649,10 +681,12 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
 define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <128 x i8>, <128 x i8>* %bp
@@ -665,10 +699,13 @@ define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
 define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <64 x i8>, <64 x i8>* %bp
@@ -681,10 +718,14 @@ define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
 define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -697,10 +738,12 @@ define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <64 x i16>, <64 x i16>* %bp
@@ -713,10 +756,13 @@ define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
 define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i16>, <32 x i16>* %bp
@@ -729,10 +775,12 @@ define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
 define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i32>, <32 x i32>* %bp
@@ -745,10 +793,12 @@ define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
 define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <128 x i8>, <128 x i8>* %bp
@@ -761,10 +811,13 @@ define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
 define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <64 x i8>, <64 x i8>* %bp
@@ -777,10 +830,14 @@ define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
 define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -793,10 +850,12 @@ define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <64 x i16>, <64 x i16>* %bp
@@ -809,10 +868,13 @@ define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
 define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i16>, <32 x i16>* %bp
@@ -825,10 +887,12 @@ define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
 define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
   %b = load <32 x i32>, <32 x i32>* %bp
-- 
2.7.4
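
Background on the reverted combine: performSignExtendSetCCCombine rewrote
(sext (setcc A, B, cc)) as (setcc (sext A), (sext B), cc) whenever both
comparison operands were cheap to extend (plain loads, masked loads, or
all-zeros splat constants), so that fixed-length (VLS) lowering could fold
the sign-extends into extending loads such as ld1sb. The updated checks
above show codegen without it: the comparison input is loaded at its narrow
element type and the predicate is widened with punpklo, or computed with
NEON cmeq and widened through sunpklo before a cmpne. The IR below is a
minimal sketch of the shape these tests exercise; the diff elides the test
bodies, so the masked.load call, its alignment and passthru operands, and
the *_sketch function name are assumptions reconstructed from the visible
"%b = load" context lines and the test names, not text from the patch.

; Sketch only: body inferred from the diff context, not copied from it.
declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)

define <32 x i16> @masked_load_sext_v32i8i16_sketch(<32 x i8>* %ap, <32 x i8>* %bp) {
  ; Build the mask by comparing the loaded i8 data against zero.
  %b = load <32 x i8>, <32 x i8>* %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  ; The masked load's result is sign-extended to i16, so lowering wants the
  ; comparison result at the wider element width; that is where the
  ; (sext (setcc ...)) node arises, which the reverted combine turned into
  ; a setcc of sign-extending loads.
  %load = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}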