DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
+// Combine a (mul v2i64 x, y) where both operands are lane-wise extensions
+// from i32 into an ARMISD::VMULLs/VMULLu node, which the MVE patterns
+// select to vmullb.s32/vmullb.u32 (or the vmullt forms via vrev patterns).
+static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Returns the pre-extension operand when Op is a sign_extend_inreg from
+ // i32 scalars, otherwise an empty SDValue.
+ auto IsSignExt = [&](SDValue Op) {
+ if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
+ return SDValue();
+ EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
+ if (VT.getScalarSizeInBits() == 32)
+ return Op->getOperand(0);
+ return SDValue();
+ };
+ auto IsZeroExt = [&](SDValue Op) {
+ // Zero extends are a little more awkward. At the point we are matching
+ // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
+ // That might be before or after a bitcast depending on how the and is
+ // placed. Because this has to look through bitcasts, it is currently only
+ // supported on LE.
+ if (!Subtarget->isLittle())
+ return SDValue();
+
+ SDValue And = Op;
+ if (And->getOpcode() == ISD::BITCAST)
+ And = And->getOperand(0);
+ if (And->getOpcode() != ISD::AND)
+ return SDValue();
+ SDValue Mask = And->getOperand(1);
+ if (Mask->getOpcode() == ISD::BITCAST)
+ Mask = Mask->getOperand(0);
+
+ if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
+ Mask.getValueType() != MVT::v4i32)
+ return SDValue();
+ // On little-endian, a (-1, 0, -1, 0) v4i32 mask keeps the low 32 bits of
+ // each 64-bit lane, i.e. it acts as a zext_inreg from i32.
+ if (isAllOnesConstant(Mask->getOperand(0)) &&
+ isNullConstant(Mask->getOperand(1)) &&
+ isAllOnesConstant(Mask->getOperand(2)) &&
+ isNullConstant(Mask->getOperand(3)))
+ return And->getOperand(0);
+ return SDValue();
+ };
+
+ SDLoc dl(N);
+ if (SDValue Op0 = IsSignExt(N0)) {
+ if (SDValue Op1 = IsSignExt(N1)) {
+ // Reinterpret the v2i64 inputs as v4i32 and emit a signed long multiply.
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
+ }
+ }
+ if (SDValue Op0 = IsZeroExt(N0)) {
+ if (SDValue Op1 = IsZeroExt(N1)) {
+ // Same, but an unsigned long multiply.
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue PerformMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
+ return PerformMVEVMULLCombine(N, DAG, Subtarget);
+
if (Subtarget->isThumb1Only())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.is64BitVector() || VT.is128BitVector())
return PerformVMULCombine(N, DCI, Subtarget);
if (VT != MVT::i32)
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
+// Long multiply nodes: both operands share one integer type; the result is
+// an integer whose (wider) type is fixed by the patterns that use the node.
+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisInt<3>]>;
def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
(MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
- def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
- v8i8), (MVE_VMOVLs8th MQPR:$src)>;
- def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
- v4i16), (MVE_VMOVLs16th MQPR:$src)>;
- def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))),
- (i32 0xAFF)), (MVE_VMOVLu8th MQPR:$src)>;
- def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
- (v4i32 (ARMvmovImm (i32 0xCFF)))),
- (MVE_VMOVLu16th MQPR:$src)>;
+ def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8),
+ (MVE_VMOVLs8th MQPR:$src)>;
+ def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16),
+ (MVE_VMOVLs16th MQPR:$src)>;
- // zext_inreg 16 -> 32
- def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
- (MVE_VMOVLu16bh MQPR:$src)>;
// zext_inreg 8 -> 16
def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)),
(MVE_VMOVLu8bh MQPR:$src)>;
+ // zext_inreg 16 -> 32
+ def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (MVE_VMOVLu16bh MQPR:$src)>;
+ // Same zext_inreg with vrevs, picking the top half
+ def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)),
+ (MVE_VMOVLu8th MQPR:$src)>;
+ def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (MVE_VMOVLu16th MQPR:$src)>;
}
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b1>;
+// Selection patterns for MVE vmullb/vmullt. The 32x32->64 forms arrive as
+// explicit ARMvmulls/ARMvmullu nodes; the 8- and 16-bit forms are matched
+// directly from a mul of extended lanes. Patterns wrapped in ARMvrev* /
+// ARMVectorRegCast take the top halves and select the VMULLT variants.
+let Predicates = [HasMVEInt] in {
+ // Signed 32x32->64.
+ def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>;
+
+ // Signed 16x16->32: mul of sext_inreg lanes.
+ def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16),
+ (sext_inreg (v4i32 MQPR:$src2), v4i16)),
+ (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16),
+ (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)),
+ (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>;
+
+ // Signed 8x8->16.
+ def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8),
+ (sext_inreg (v8i16 MQPR:$src2), v8i8)),
+ (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8),
+ (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)),
+ (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>;
+
+ // Unsigned 32x32->64.
+ def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>;
+
+ // Unsigned 16x16->32: zext_inreg appears as an AND with a vmov-immediate.
+ def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>;
+
+ // Unsigned 8x8->16: zext_inreg appears as a VBIC with immediate.
+ def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))),
+ (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))),
+ (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>;
+}
+
class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
-def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisSameAs<1, 2>]>;
-def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
-def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
-
def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
SDTCisVT<2, v8i8>]>;
def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
DecoderNamespace = "NEONData" in {
defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "s", NEONvmulls, 1>;
+ "vmull", "s", ARMvmulls, 1>;
defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "u", NEONvmullu, 1>;
+ "vmull", "u", ARMvmullu, 1>;
def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
v8i16, v8i8, int_arm_neon_vmullp, 1>;
def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
"vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
Requires<[HasV8, HasCrypto]>;
}
-defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
-defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "s", NEONvmulls, add>;
+ "vmlal", "s", ARMvmulls, add>;
defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "u", NEONvmullu, add>;
+ "vmlal", "u", ARMvmullu, add>;
-defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
-defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>;
let Predicates = [HasNEON, HasV8_1a] in {
// v8.1a Neon Rounding Double Multiply-Op vector operations,
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "s", NEONvmulls, sub>;
+ "vmlsl", "s", ARMvmulls, sub>;
defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "u", NEONvmullu, sub>;
+ "vmlsl", "u", ARMvmullu, sub>;
-defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
-defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>;
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: umlal r0, r1, r3, r2
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: smlal r0, r1, r3, r2
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmov r2, s9
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmovlb.u8 q3, q3
; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.u8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r0
+; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r0
+; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
+; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
+; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
+; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.u8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmullb.u8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.s8 q2, q2
-; CHECK-NEXT: vmovlb.s8 q3, q3
; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.s8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r0
+; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r0
+; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
+; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
+; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r0
+; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmovlb.s8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmullb.s8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s6
-; CHECK-NEXT: umull r2, lr, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: umlal r2, lr, r3, r12
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r12, s11
+; CHECK-NEXT: vmov lr, s9
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, lr
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i32> %x to <2 x i64>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s6
-; CHECK-NEXT: smull r2, lr, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: smlal r2, lr, r3, r12
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r12, s11
+; CHECK-NEXT: vmov lr, s9
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, lr
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i32> %x to <2 x i64>
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmovlb.u8 q3, q3
; CHECK-NEXT: vmov.u8 r1, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.u8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r1
+; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r1
+; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r1
+; CHECK-NEXT: vmov.16 q1[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r1
+; CHECK-NEXT: vmov.16 q1[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r1
+; CHECK-NEXT: vmov.16 q1[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r1
+; CHECK-NEXT: vmov.16 q1[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r1
+; CHECK-NEXT: vmov.16 q1[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.u8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r1
+; CHECK-NEXT: vmullb.u8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.s8 q2, q2
-; CHECK-NEXT: vmovlb.s8 q3, q3
; CHECK-NEXT: vmov.u8 r1, q1[0]
-; CHECK-NEXT: vmul.i16 q2, q3, q2
+; CHECK-NEXT: vmullb.s8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q3
-; CHECK-NEXT: vmov.16 q3[0], r1
+; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q3[1], r1
+; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q3[2], r1
+; CHECK-NEXT: vmov.16 q1[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q3[3], r1
+; CHECK-NEXT: vmov.16 q1[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q3[4], r1
+; CHECK-NEXT: vmov.16 q1[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q3[5], r1
+; CHECK-NEXT: vmov.16 q1[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q3[6], r1
+; CHECK-NEXT: vmov.16 q1[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r1
-; CHECK-NEXT: vmovlb.s8 q0, q3
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.16 q1[7], r1
+; CHECK-NEXT: vmullb.s8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r5, lr}
; CHECK-NEXT: push {r5, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: blt .LBB0_2
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r5, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vldrw.u32 q3, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vrev64.32 q1, q2
-; CHECK-NEXT: vrev64.32 q4, q3
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: vmov lr, s16
-; CHECK-NEXT: smull r12, r5, lr, r12
+; CHECK-NEXT: vmullt.s32 q3, q2, q1
+; CHECK-NEXT: vmov r5, s13
+; CHECK-NEXT: vmov r12, s12
; CHECK-NEXT: lsrl r12, r5, #31
; CHECK-NEXT: vmov.32 q0[0], r12
-; CHECK-NEXT: vmov r12, s6
+; CHECK-NEXT: vmov r12, s14
; CHECK-NEXT: vmov.32 q0[1], r5
-; CHECK-NEXT: vmov r5, s18
-; CHECK-NEXT: smull r12, r5, r5, r12
+; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: lsrl r12, r5, #31
+; CHECK-NEXT: vmullb.s32 q3, q2, q1
; CHECK-NEXT: vmov.32 q0[2], r12
-; CHECK-NEXT: vmov r12, s8
+; CHECK-NEXT: vmov r12, s12
; CHECK-NEXT: vmov.32 q0[3], r5
-; CHECK-NEXT: vmov r5, s12
-; CHECK-NEXT: smull r12, r5, r5, r12
+; CHECK-NEXT: vmov r5, s13
; CHECK-NEXT: lsrl r12, r5, #31
; CHECK-NEXT: vmov.32 q1[0], r12
-; CHECK-NEXT: vmov r12, s10
+; CHECK-NEXT: vmov r12, s14
; CHECK-NEXT: vmov.32 q1[1], r5
-; CHECK-NEXT: vmov r5, s14
-; CHECK-NEXT: smull r12, r5, r5, r12
+; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: lsrl r12, r5, #31
; CHECK-NEXT: vmov.32 q1[2], r12
; CHECK-NEXT: vmov.32 q1[3], r5
; CHECK-NEXT: vmov.f32 s7, s10
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB0_1
-; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r5, pc}
entry:
%0 = and i32 %n, 3
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
-; CHECK-NEXT: vldrh.u16 q2, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: subs r3, #8
-; CHECK-NEXT: vmovlt.s16 q1, q0
-; CHECK-NEXT: vmovlt.s16 q3, q2
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmovlb.s16 q2, q2
-; CHECK-NEXT: vmul.i32 q1, q3, q1
-; CHECK-NEXT: vmul.i32 q0, q2, q0
-; CHECK-NEXT: vshr.u32 q1, q1, #15
+; CHECK-NEXT: vmullt.s16 q2, q1, q0
+; CHECK-NEXT: vmullb.s16 q0, q1, q0
+; CHECK-NEXT: vshr.u32 q2, q2, #15
; CHECK-NEXT: vshr.u32 q0, q0, #15
-; CHECK-NEXT: vmovnt.i32 q0, q1
+; CHECK-NEXT: vmovnt.i32 q0, q2
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q0, [r0], #16
-; CHECK-NEXT: vldrb.u8 q2, [r1], #16
+; CHECK-NEXT: vldrb.u8 q1, [r1], #16
; CHECK-NEXT: subs r3, #16
-; CHECK-NEXT: vmovlt.u8 q1, q0
-; CHECK-NEXT: vmovlt.u8 q3, q2
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmul.i16 q1, q3, q1
-; CHECK-NEXT: vmul.i16 q0, q2, q0
-; CHECK-NEXT: vshr.u16 q1, q1, #7
+; CHECK-NEXT: vmullt.u8 q2, q1, q0
+; CHECK-NEXT: vmullb.u8 q0, q1, q0
+; CHECK-NEXT: vshr.u16 q2, q2, #7
; CHECK-NEXT: vshr.u16 q0, q0, #7
-; CHECK-NEXT: vmovnt.i16 q0, q1
+; CHECK-NEXT: vmovnt.i16 q0, q2
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
define arm_aapcs_vfpcc <2 x i64> @sext_02(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: sext_02:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.32 q2[3], r1
+; CHECK-NEXT: vmullb.s32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc <2 x i64> @sext_13(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: sext_13:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q1
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: smull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmov.32 q0[3], r1
+; CHECK-NEXT: vmullt.s32 q2, q0, q1
+; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
define arm_aapcs_vfpcc <2 x i64> @zext_02(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: zext_02:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.32 q2[3], r1
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc <2 x i64> @zext_13(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: zext_13:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q1
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmov.32 q0[3], r1
+; CHECK-NEXT: vmullt.u32 q2, q0, q1
+; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
define arm_aapcs_vfpcc <4 x i32> @sext_0246(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: sext_0246:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullb.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
define arm_aapcs_vfpcc <4 x i32> @sext_1357(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: sext_1357:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.s16 q1, q1
-; CHECK-NEXT: vmovlt.s16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullt.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
define arm_aapcs_vfpcc <4 x i32> @zext_0246(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: zext_0246:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullb.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
define arm_aapcs_vfpcc <4 x i32> @zext_1357(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: zext_1357:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.u16 q1, q1
-; CHECK-NEXT: vmovlt.u16 q0, q0
-; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: vmullt.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
define arm_aapcs_vfpcc <8 x i16> @sext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sext_02468101214:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.s8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
define arm_aapcs_vfpcc <8 x i16> @sext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sext_13579111315:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.s8 q1, q1
-; CHECK-NEXT: vmovlt.s8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullt.s8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: zext_02468101214:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
define arm_aapcs_vfpcc <8 x i16> @zext_13579111315(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: zext_13579111315:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlt.u8 q1, q1
-; CHECK-NEXT: vmovlt.u8 q0, q0
-; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmullt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>