From: David Green Date: Wed, 22 Jul 2020 13:08:29 +0000 (+0100) Subject: [ARM] Extra MVE select(binop) patterns X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f8abecf3379de841c436f353f060929722af8602;p=platform%2Fupstream%2Fllvm.git [ARM] Extra MVE select(binop) patterns This is very similar to 243970d03cace2, but handling a slightly different form of predicated operations. When starting with a pattern of the form select(p, BinOp(x, y), x), Instcombine will often transform this to BinOp(x, select(p, y, 0)), where 0 is the identity value of the binop (0 for adds/subs, 1 for muls, -1 for ands etc). This adds the patterns that transform those back into predicated binary operations. There is also a very minor adjustment to tablegen null_frag in here, to allow it to also be recognized as a PatLeaf node, so that it can be used in MVE_TwoOpPattern to easily exclude the cases where we do not need the alternate transform. Differential Revision: https://reviews.llvm.org/D84091 --- diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index da0a836..db2ab7a 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -498,6 +498,18 @@ def SubReg_i32_lane : SDNodeXForm; +def ARMimmAllZerosV: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 0))))>; +def ARMimmAllZerosD: PatLeaf<(bitconvert (v2i32 (ARMvmovImm (i32 0))))>; +def ARMimmAllOnesV: PatLeaf<(bitconvert (v16i8 (ARMvmovImm (i32 0xEFF))))>; +def ARMimmAllOnesD: PatLeaf<(bitconvert (v8i8 (ARMvmovImm (i32 0xEFF))))>; + +def ARMimmOneV: PatLeaf<(ARMvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == N->getValueType(0).getScalarSizeInBits() && EltVal == 0x01); +}]>; + //===----------------------------------------------------------------------===// // Operand Definitions. 
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 95f41ab..b8c651a 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -318,9 +318,9 @@ def MVE_v2f64 : MVEVectorVTInfo; def MVE_v16p8 : MVEVectorVTInfo; def MVE_v8p16 : MVEVectorVTInfo; - multiclass MVE_TwoOpPattern { + dag PredOperands, Instruction Inst, + SDPatternOperator IdentityVec = null_frag> { // Unpredicated def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; @@ -334,6 +334,15 @@ multiclass MVE_TwoOpPattern; + + // Optionally with the select folded through the op + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qn), + (VTI.Vec IdentityVec))))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qm)))>; } // Predicated with intrinsic @@ -346,7 +355,8 @@ multiclass MVE_TwoOpPattern { + dag PredOperands, Instruction Inst, + SDPatternOperator IdentityVec = null_frag> { // Unpredicated def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn)))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn))>; @@ -360,6 +370,15 @@ multiclass MVE_TwoOpPatternDup; + + // Optionally with the select folded through the op + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (ARMvdup rGPR:$Rn), + (VTI.Vec IdentityVec))))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qm)))>; } // Predicated with intrinsic @@ -1492,20 +1511,20 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f } let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; - defm : 
MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; - defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; + defm : MVE_TwoOpPattern; defm : MVE_TwoOpPattern, int_arm_mve_bic_predicated, (? ), MVE_VBIC>; @@ -1775,7 +1794,7 @@ multiclass MVE_VMUL_m { let Predicates = [HasMVEInt] in { defm : MVE_TwoOpPattern(NAME)>; + !cast(NAME), ARMimmOneV>; } } @@ -1849,7 +1868,7 @@ multiclass MVE_VADDSUB_m(NAME); let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern(NAME)>; + defm : MVE_TwoOpPattern(NAME), ARMimmAllZerosV>; } } @@ -4984,7 +5003,7 @@ multiclass MVE_VADDSUB_qr_m { def "" : MVE_VADDSUB_qr; let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPatternDup(NAME)>; + defm : MVE_TwoOpPatternDup(NAME), ARMimmAllZerosV>; } } @@ -5270,7 +5289,7 @@ class MVE_VMUL_qr_int size> multiclass MVE_VMUL_qr_int_m { def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; defm : MVE_TwoOpPatternDup(NAME)>; + !cast(NAME), ARMimmOneV>; } defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m; @@ -6864,7 +6883,7 @@ class MVE_vector_load_typed - : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))), (Ty (RegImmInst t2addrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; multiclass MVE_vector_load:$addr))>; // Masked ext loads - def : Pat<(VT (!cast("aligned_extmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + def : Pat<(VT (!cast("aligned_extmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadUInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; - def : Pat<(VT (!cast("aligned_sextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT 
NEONimmAllZerosV))), + def : Pat<(VT (!cast("aligned_sextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadSInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; - def : Pat<(VT (!cast("aligned_zextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + def : Pat<(VT (!cast("aligned_zextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadUInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; } diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index c097a4a..b05a388 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -534,20 +534,6 @@ def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>; def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; -def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{ - ConstantSDNode *ConstVal = cast(N->getOperand(0)); - unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); - return (EltBits == 32 && EltVal == 0); -}]>; - -def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{ - ConstantSDNode *ConstVal = cast(N->getOperand(0)); - unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); - return (EltBits == 8 && EltVal == 0xff); -}]>; - //===----------------------------------------------------------------------===// // NEON load / store instructions //===----------------------------------------------------------------------===// @@ -5273,9 +5259,9 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", // Vector Bitwise Operations. 
def vnotd : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>; + (xor node:$in, ARMimmAllOnesD)>; def vnotq : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>; + (xor node:$in, ARMimmAllOnesV)>; // VAND : Vector Bitwise AND @@ -6054,9 +6040,9 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. def vnegd : PatFrag<(ops node:$in), - (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>; + (sub ARMimmAllZerosD, node:$in)>; def vnegq : PatFrag<(ops node:$in), - (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>; + (sub ARMimmAllZerosV, node:$in)>; class VNEGD size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), @@ -6270,11 +6256,11 @@ defm : NEONImmReplicateInstAlias, Requires<[HasZCZ]>; def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm, - [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))], + [(set QPR:$Vd, (v4i32 ARMimmAllZerosV))], (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>, Requires<[HasZCZ]>; } diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll index 00b6b8b..df92d30 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll @@ -4,10 +4,9 @@ define arm_aapcs_vfpcc <4 x i32> @add_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: add_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -19,10 +18,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @add_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: add_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; 
CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -34,10 +32,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @add_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: add_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vadd.i8 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -49,10 +46,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @sub_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: sub_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -64,10 +60,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @sub_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: sub_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -79,10 +74,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @sub_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: sub_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vsubt.i8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -94,10 +88,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @mul_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: mul_v4i32_x: ; CHECK: @ %bb.0: @ 
%entry -; CHECK-NEXT: vmov.i32 q2, #0x1 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -109,10 +102,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @mul_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: mul_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i16 q2, #0x1 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -124,10 +116,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @mul_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: mul_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0x1 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vmul.i8 q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -139,10 +130,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @and_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: and_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -154,10 +144,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @and_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: and_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -169,10 +158,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> 
@and_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: and_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -184,10 +172,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @or_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: or_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -199,10 +186,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @or_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: or_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -214,10 +200,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @or_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: or_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -229,10 +214,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @xor_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: xor_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> 
@llvm.arm.mve.vctp32(i32 %n) @@ -244,10 +228,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @xor_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: xor_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -259,10 +242,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @xor_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: xor_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q1, q1, q2 -; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -274,11 +256,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @andnot_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: andnot_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmvn q1, q1 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q2 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -291,11 +272,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @andnot_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: andnot_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmvn q1, q1 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q2 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -308,11 +288,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @andnot_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: andnot_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmvn q1, q1 ; 
CHECK-NEXT: vctp.8 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q2 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vandt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -325,12 +304,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @ornot_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: ornot_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmvn q1, q1 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q3 -; CHECK-NEXT: vorr q0, q2, q0 +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -343,12 +320,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @ornot_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: ornot_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmvn q1, q1 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q3 -; CHECK-NEXT: vorr q0, q2, q0 +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -361,12 +336,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @ornot_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: ornot_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmvn q1, q1 ; CHECK-NEXT: vctp.8 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: veort q2, q1, q3 -; CHECK-NEXT: vorr q0, q2, q0 +; CHECK-NEXT: vorrt q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -871,11 +844,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @addqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) { ; CHECK-LABEL: addqr_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.32 q1, r0 -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vaddt.i32 q0, q0, r0 ; 
CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -889,11 +860,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @addqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) { ; CHECK-LABEL: addqr_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.16 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.16 q1, r0 -; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vaddt.i16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -907,11 +876,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @addqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) { ; CHECK-LABEL: addqr_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.8 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.8 q1, r0 -; CHECK-NEXT: vadd.i8 q0, q1, q0 +; CHECK-NEXT: vaddt.i8 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -925,11 +892,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @subqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) { ; CHECK-LABEL: subqr_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.32 q1, r0 -; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vsubt.i32 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -943,11 +908,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @subqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) { ; CHECK-LABEL: subqr_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.16 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.16 q1, r0 -; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vsubt.i16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -961,11 +924,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @subqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) { ; CHECK-LABEL: subqr_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.8 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: 
vdupt.8 q1, r0 -; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vsubt.i8 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -979,11 +940,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @mulqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) { ; CHECK-LABEL: mulqr_v4i32_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x1 ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.32 q1, r0 -; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vmult.i32 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -997,11 +956,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @mulqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) { ; CHECK-LABEL: mulqr_v8i16_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i16 q1, #0x1 ; CHECK-NEXT: vctp.16 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.16 q1, r0 -; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vmult.i16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1015,11 +972,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @mulqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) { ; CHECK-LABEL: mulqr_v16i8_x: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q1, #0x1 ; CHECK-NEXT: vctp.8 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.8 q1, r0 -; CHECK-NEXT: vmul.i8 q0, q1, q0 +; CHECK-NEXT: vmult.i8 q0, q0, r0 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -1327,10 +1282,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @add_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: add_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -1342,10 +1297,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @add_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: 
add_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1357,10 +1312,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @add_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: add_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i8 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -1417,10 +1372,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @mul_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: mul_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x1 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -1432,10 +1387,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @mul_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: mul_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i16 q2, #0x1 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1447,10 +1402,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @mul_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: mul_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0x1 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vpst +; 
CHECK-NEXT: vmult.i8 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -1462,10 +1417,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @and_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: and_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -1477,10 +1432,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @and_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: and_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1492,10 +1447,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @and_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: and_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -1507,10 +1462,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @or_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: or_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -1522,10 +1477,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @or_v8i16_y(<8 x i16> 
%x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: or_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1537,10 +1492,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @or_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: or_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -1552,10 +1507,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @xor_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: xor_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -1567,10 +1522,10 @@ entry: define arm_aapcs_vfpcc <8 x i16> @xor_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) { ; CHECK-LABEL: xor_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -1582,10 +1537,10 @@ entry: define arm_aapcs_vfpcc <16 x i8> @xor_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) { ; CHECK-LABEL: xor_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.8 r0 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vpst 
+; CHECK-NEXT: veort q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -2219,10 +2174,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @addqr_v4i32_y(<4 x i32> %x, i32 %y, i32 %n) { ; CHECK-LABEL: addqr_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: vctp.32 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -2236,10 +2192,11 @@ entry: define arm_aapcs_vfpcc <8 x i16> @addqr_v8i16_y(<8 x i16> %x, i16 %y, i32 %n) { ; CHECK-LABEL: addqr_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vdup.16 q1, r0 ; CHECK-NEXT: vctp.16 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -2253,10 +2210,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @addqr_v16i8_y(<16 x i8> %x, i8 %y, i32 %n) { ; CHECK-LABEL: addqr_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vdup.8 q1, r0 ; CHECK-NEXT: vctp.8 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i8 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) @@ -2324,10 +2282,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @mulqr_v4i32_y(<4 x i32> %x, i32 %y, i32 %n) { ; CHECK-LABEL: mulqr_v4i32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: vctp.32 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmul.i32 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr 
entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) @@ -2341,10 +2300,11 @@ entry: define arm_aapcs_vfpcc <8 x i16> @mulqr_v8i16_y(<8 x i16> %x, i16 %y, i32 %n) { ; CHECK-LABEL: mulqr_v8i16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i16 q1, #0x1 +; CHECK-NEXT: vdup.16 q1, r0 ; CHECK-NEXT: vctp.16 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmul.i16 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) @@ -2358,10 +2318,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @mulqr_v16i8_y(<16 x i8> %x, i8 %y, i32 %n) { ; CHECK-LABEL: mulqr_v16i8_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i8 q1, #0x1 +; CHECK-NEXT: vdup.8 q1, r0 ; CHECK-NEXT: vctp.8 r1 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmul.i8 q0, q0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q1, q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index 918e2db..c8b6dd0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -214,55 +214,54 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i32_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr +; CHECK-NEXT: vmov.i32 q4, #0xffff ; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0xffff -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: 
vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q4, q5, q3 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vandt q2, q3, q4 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vandt q2, q1, q3 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, 
d11} +; CHECK-NEXT: vmovlb.u16 q0, q1 +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddv.u32 r0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -275,54 +274,50 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.u16 r0, q0[0] ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, 
q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmovlb.s16 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmovlb.s16 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddv.u32 r0, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -744,129 +739,128 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q4, #0xff -; CHECK-NEXT: vpsel q6, q4, q0 +; CHECK-NEXT: vmov.i8 q5, #0xff +; CHECK-NEXT: vpsel q7, q5, q0 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vmov.i32 q5, #0xff +; CHECK-NEXT: vmov.u8 r0, q7[0] +; CHECK-NEXT: vmov.i32 q6, #0xff ; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] +; CHECK-NEXT: vmov.u8 r0, q7[1] ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] +; CHECK-NEXT: vmov.u8 r0, q7[2] ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] +; CHECK-NEXT: vmov.u8 r0, q7[3] ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] +; CHECK-NEXT: vmov.u8 r0, q7[4] ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] +; CHECK-NEXT: vmov.u8 r0, q7[5] ; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, 
q6[14] +; CHECK-NEXT: vmov.u8 r0, q7[6] ; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] +; CHECK-NEXT: vmov.u8 r0, q7[7] ; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vpsel q3, q4, q0 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vpsel q4, q5, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.u16 r0, q4[4] ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.u16 r0, q4[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov.u16 r0, q4[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.u8 r0, q2[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[0] +; CHECK-NEXT: vmov.u8 r0, q7[8] ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q7, q0, q5 +; CHECK-NEXT: vandt q3, q0, q6 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] +; CHECK-NEXT: vmov.u8 r0, q7[9] ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] +; CHECK-NEXT: vmov.u8 r0, q7[10] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] +; CHECK-NEXT: vmov.u8 r0, q7[11] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] +; CHECK-NEXT: vmov.u8 r0, q7[12] ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] +; CHECK-NEXT: vmov.u8 r0, q7[13] ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] +; CHECK-NEXT: vmov.u8 r0, q7[14] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] +; 
CHECK-NEXT: vmov.u8 r0, q7[15] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov q6, q1 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vpsel q4, q4, q0 -; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vmov.u16 r0, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.u16 r0, q5[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov.u16 r0, q5[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.u16 r0, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.u8 r0, q2[12] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] +; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] +; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov.u8 r0, q2[15] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q6, q0, q5 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vadd.i32 q0, q6, q7 -; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.32 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.32 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vand q0, q0, q6 ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q3, q6, q5 -; CHECK-NEXT: vmov.32 q6[0], r0 +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.u16 r0, 
q4[1] -; CHECK-NEXT: vmov.32 q6[1], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.32 q6[3], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vpt.i32 ne, q6, zr -; CHECK-NEXT: vandt q1, q4, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q5[0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q1, q0, q6 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q1, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -886,26 +880,27 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q3, #0x0 -; CHECK-NEXT: vmov.i8 q4, #0xff -; CHECK-NEXT: vpsel q5, q4, q3 -; CHECK-NEXT: 
vmov.u8 r0, q5[8] +; CHECK-NEXT: vmov.i8 q5, #0xff +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vpsel q6, q5, q3 +; CHECK-NEXT: vmov.u8 r0, q6[0] ; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q5[9] +; CHECK-NEXT: vmov.u8 r0, q6[1] ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q5[10] +; CHECK-NEXT: vmov.u8 r0, q6[2] ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q5[11] +; CHECK-NEXT: vmov.u8 r0, q6[3] ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q5[12] +; CHECK-NEXT: vmov.u8 r0, q6[4] ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q5[13] +; CHECK-NEXT: vmov.u8 r0, q6[5] ; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q5[14] +; CHECK-NEXT: vmov.u8 r0, q6[6] ; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q5[15] +; CHECK-NEXT: vmov.u8 r0, q6[7] ; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: vcmp.i16 ne, q1, zr -; CHECK-NEXT: vpsel q2, q4, q3 +; CHECK-NEXT: vpsel q2, q5, q3 ; CHECK-NEXT: vmov.u16 r0, q2[4] ; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[5] @@ -914,55 +909,58 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q5[0] +; CHECK-NEXT: vmov.u8 r0, q6[8] ; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u8 r0, q5[1] +; CHECK-NEXT: vmov.u8 r0, q6[9] ; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u8 r0, q5[2] +; CHECK-NEXT: vmov.u8 r0, q6[10] ; CHECK-NEXT: vmov.16 q7[2], r0 -; 
CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.u8 r0, q6[11] ; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u8 r0, q5[4] +; CHECK-NEXT: vmov.u8 r0, q6[12] ; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u8 r0, q5[5] +; CHECK-NEXT: vmov.u8 r0, q6[13] ; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u8 r0, q5[6] +; CHECK-NEXT: vmov.u8 r0, q6[14] ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u8 r0, q5[7] -; CHECK-NEXT: vmovlb.s16 q6, q1 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.u8 r0, q6[15] +; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vpsel q6, q6, q1 +; CHECK-NEXT: vpsel q1, q1, q4 ; CHECK-NEXT: vcmp.i16 ne, q7, zr -; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vpsel q3, q5, q3 ; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q5[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.32 q5[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmovlb.s8 q5, q5 +; CHECK-NEXT: vmovlb.s16 q5, q5 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q5 ; CHECK-NEXT: vmov.32 q5[0], r0 ; CHECK-NEXT: vmov.u16 r0, 
q2[1] ; CHECK-NEXT: vmov.32 q5[1], r0 @@ -970,44 +968,39 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov.32 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmovlb.s8 q4, q4 +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmovlb.s16 q4, q4 +; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vpsel q4, q4, q1 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q4 +; CHECK-NEXT: vmov.32 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r0, q0[9] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov.u8 r0, q0[11] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vmovlb.s8 q0, q3 -; CHECK-NEXT: vpsel q2, q2, q1 -; CHECK-NEXT: vcmp.i32 ne, q5, zr ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q4, q4, q6 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: 
vadd.i32 q0, q0, q4 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q2, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1061,87 +1054,83 @@ entry: define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i16_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vcmp.i16 ne, q2, zr ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; 
CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmovlb.u8 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r0, q0[9] ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.u8 r0, q0[11] ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.u8 r0, q0[13] ; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: 
vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmovlb.u8 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: vpt.i16 ne, q3, zr +; CHECK-NEXT: vaddt.i16 q2, q2, q0 +; CHECK-NEXT: vaddv.u16 r0, q2 ; CHECK-NEXT: uxth r0, r0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -1154,87 +1143,83 @@ entry: define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vcmp.i16 ne, q2, zr ; CHECK-NEXT: vmov.16 q2[0], r0 
-; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r0, q0[9] ; CHECK-NEXT: 
vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.u8 r0, q0[11] ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.u8 r0, q0[13] ; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmovlb.s8 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vmovlb.s8 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: vpt.i16 ne, q3, zr +; CHECK-NEXT: vaddt.i16 q2, q2, q0 +; CHECK-NEXT: vaddv.u16 r0, q2 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -2226,55 +2211,54 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %b, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr +; CHECK-NEXT: vmov.i32 q4, #0xffff ; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0xffff -; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[2] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.u16 r1, 
q1[3] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q3[3], r1 ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmov.u16 r1, q1[4] ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q4, q5, q3 -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vandt q2, q3, q4 +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.u16 r1, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r1, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r1 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vandt q2, q1, q3 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmovlb.u16 q0, q1 +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddva.u32 r0, q2 +; 
CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -2288,54 +2272,50 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %b, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[2] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[3] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.u16 r1, q0[0] ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r1, q0[1] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[2] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[3] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.32 q4[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.32 q4[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.32 q4[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.32 q4[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r1 +; CHECK-NEXT: 
vmov.u16 r1, q0[4] ; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.u16 r1, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmovlb.s16 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u16 r1, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r1 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i32 ne, q4, zr ; CHECK-NEXT: vmovlb.s16 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddva.u32 r0, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -2779,129 +2759,128 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr -; CHECK-NEXT: vmov.i8 q4, #0xff -; CHECK-NEXT: vpsel q6, q4, q0 +; CHECK-NEXT: vmov.i8 q5, #0xff +; CHECK-NEXT: vpsel q7, q5, q0 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.u8 r1, q6[8] -; CHECK-NEXT: vmov.i32 q5, #0xff +; CHECK-NEXT: vmov.u8 r1, q7[0] +; CHECK-NEXT: vmov.i32 q6, #0xff ; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.u8 r1, q6[9] +; CHECK-NEXT: vmov.u8 r1, q7[1] ; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov.u8 r1, q6[10] +; CHECK-NEXT: vmov.u8 r1, q7[2] ; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov.u8 r1, q6[11] +; CHECK-NEXT: vmov.u8 r1, q7[3] ; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q6[12] +; CHECK-NEXT: vmov.u8 r1, q7[4] ; CHECK-NEXT: vmov.16 q1[4], r1 -; CHECK-NEXT: vmov.u8 r1, q6[13] +; CHECK-NEXT: vmov.u8 r1, q7[5] ; CHECK-NEXT: vmov.16 q1[5], r1 -; CHECK-NEXT: vmov.u8 r1, q6[14] +; CHECK-NEXT: vmov.u8 r1, q7[6] ; CHECK-NEXT: vmov.16 q1[6], r1 -; CHECK-NEXT: vmov.u8 r1, q6[15] +; 
CHECK-NEXT: vmov.u8 r1, q7[7] ; CHECK-NEXT: vmov.16 q1[7], r1 ; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vpsel q3, q4, q0 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.u16 r1, q3[4] +; CHECK-NEXT: vpsel q4, q5, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.u16 r1, q4[4] ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u16 r1, q3[5] +; CHECK-NEXT: vmov.u16 r1, q4[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q3[6] +; CHECK-NEXT: vmov.u16 r1, q4[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q3[7] +; CHECK-NEXT: vmov.u16 r1, q4[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vmov.u8 r1, q2[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.u8 r1, q2[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.u8 r1, q2[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.u8 r1, q2[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q6[0] +; CHECK-NEXT: vmov.u8 r1, q7[8] ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q7, q0, q5 +; CHECK-NEXT: vandt q3, q0, q6 ; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q6[1] +; CHECK-NEXT: vmov.u8 r1, q7[9] ; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q6[2] +; CHECK-NEXT: vmov.u8 r1, q7[10] ; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q6[3] +; CHECK-NEXT: vmov.u8 r1, q7[11] ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q6[4] +; CHECK-NEXT: vmov.u8 r1, q7[12] ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.u8 r1, q6[5] +; CHECK-NEXT: vmov.u8 r1, q7[13] ; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.u8 r1, q6[6] +; CHECK-NEXT: vmov.u8 r1, q7[14] ; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.u8 r1, q6[7] +; CHECK-NEXT: vmov.u8 r1, q7[15] ; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vmov q6, q1 ; CHECK-NEXT: vcmp.i16 
ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vpsel q4, q4, q0 -; CHECK-NEXT: vmov.u16 r1, q4[4] +; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vmov.u16 r1, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u16 r1, q4[5] +; CHECK-NEXT: vmov.u16 r1, q5[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[6] +; CHECK-NEXT: vmov.u16 r1, q5[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[7] +; CHECK-NEXT: vmov.u16 r1, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[4] +; CHECK-NEXT: vmov.u8 r1, q2[12] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[5] +; CHECK-NEXT: vmov.u8 r1, q2[13] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[6] +; CHECK-NEXT: vmov.u8 r1, q2[14] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[7] +; CHECK-NEXT: vmov.u8 r1, q2[15] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q6, q0, q5 -; CHECK-NEXT: vmov.u16 r1, q3[0] -; CHECK-NEXT: vadd.i32 q0, q6, q7 -; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.u16 r1, q3[1] -; CHECK-NEXT: vmov.32 q6[1], r1 -; CHECK-NEXT: vmov.u16 r1, q3[2] -; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.u16 r1, q3[3] -; CHECK-NEXT: vmov.32 q6[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[8] -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[9] -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.32 q6[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[10] -; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[11] -; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vmov.u16 r1, q4[0] +; CHECK-NEXT: vand q0, q0, q6 ; CHECK-NEXT: vpst -; CHECK-NEXT: vandt q3, q6, q5 -; CHECK-NEXT: vmov.32 q6[0], r1 +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vmov.32 q6[1], r1 +; CHECK-NEXT: vmov.32 q0[1], r1 ; CHECK-NEXT: vmov.u16 r1, q4[2] -; 
CHECK-NEXT: vmov.32 q6[2], r1 +; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vmov.u16 r1, q4[3] -; CHECK-NEXT: vmov.32 q6[3], r1 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmov.u8 r1, q2[0] -; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u8 r1, q2[1] -; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vmov.32 q0[1], r1 ; CHECK-NEXT: vmov.u8 r1, q2[2] -; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vmov.u8 r1, q2[3] -; CHECK-NEXT: vmov.32 q4[3], r1 -; CHECK-NEXT: vpt.i32 ne, q6, zr -; CHECK-NEXT: vandt q1, q4, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u16 r1, q5[0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q1, q0, q6 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u16 r1, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u16 r1, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u16 r1, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q1, q3 ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -2922,26 +2901,27 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q3, #0x0 -; CHECK-NEXT: vmov.i8 q4, #0xff -; CHECK-NEXT: vpsel q5, q4, q3 -; CHECK-NEXT: vmov.u8 r1, q5[8] +; CHECK-NEXT: vmov.i8 q5, #0xff +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vpsel 
q6, q5, q3 +; CHECK-NEXT: vmov.u8 r1, q6[0] ; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.u8 r1, q5[9] +; CHECK-NEXT: vmov.u8 r1, q6[1] ; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov.u8 r1, q5[10] +; CHECK-NEXT: vmov.u8 r1, q6[2] ; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov.u8 r1, q5[11] +; CHECK-NEXT: vmov.u8 r1, q6[3] ; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q5[12] +; CHECK-NEXT: vmov.u8 r1, q6[4] ; CHECK-NEXT: vmov.16 q1[4], r1 -; CHECK-NEXT: vmov.u8 r1, q5[13] +; CHECK-NEXT: vmov.u8 r1, q6[5] ; CHECK-NEXT: vmov.16 q1[5], r1 -; CHECK-NEXT: vmov.u8 r1, q5[14] +; CHECK-NEXT: vmov.u8 r1, q6[6] ; CHECK-NEXT: vmov.16 q1[6], r1 -; CHECK-NEXT: vmov.u8 r1, q5[15] +; CHECK-NEXT: vmov.u8 r1, q6[7] ; CHECK-NEXT: vmov.16 q1[7], r1 ; CHECK-NEXT: vcmp.i16 ne, q1, zr -; CHECK-NEXT: vpsel q2, q4, q3 +; CHECK-NEXT: vpsel q2, q5, q3 ; CHECK-NEXT: vmov.u16 r1, q2[4] ; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.u16 r1, q2[5] @@ -2950,55 +2930,58 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vmov.u16 r1, q2[7] ; CHECK-NEXT: vmov.32 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmov.32 q1[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[7] ; CHECK-NEXT: vmov.32 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q5[0] +; CHECK-NEXT: vmov.u8 r1, q6[8] ; CHECK-NEXT: vmov.16 q7[0], r1 -; CHECK-NEXT: vmov.u8 r1, q5[1] +; CHECK-NEXT: vmov.u8 r1, q6[9] ; CHECK-NEXT: vmov.16 q7[1], r1 -; CHECK-NEXT: vmov.u8 r1, q5[2] +; CHECK-NEXT: vmov.u8 r1, q6[10] ; CHECK-NEXT: vmov.16 q7[2], r1 -; CHECK-NEXT: vmov.u8 r1, q5[3] +; CHECK-NEXT: vmov.u8 r1, q6[11] ; CHECK-NEXT: vmov.16 q7[3], r1 -; 
CHECK-NEXT: vmov.u8 r1, q5[4] +; CHECK-NEXT: vmov.u8 r1, q6[12] ; CHECK-NEXT: vmov.16 q7[4], r1 -; CHECK-NEXT: vmov.u8 r1, q5[5] +; CHECK-NEXT: vmov.u8 r1, q6[13] ; CHECK-NEXT: vmov.16 q7[5], r1 -; CHECK-NEXT: vmov.u8 r1, q5[6] +; CHECK-NEXT: vmov.u8 r1, q6[14] ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmov.16 q7[6], r1 -; CHECK-NEXT: vmov.u8 r1, q5[7] -; CHECK-NEXT: vmovlb.s16 q6, q1 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.u8 r1, q6[15] +; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmov.16 q7[7], r1 -; CHECK-NEXT: vpsel q6, q6, q1 +; CHECK-NEXT: vpsel q1, q1, q4 ; CHECK-NEXT: vcmp.i16 ne, q7, zr -; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vpsel q3, q5, q3 ; CHECK-NEXT: vmov.u16 r1, q3[4] -; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.32 q5[0], r1 ; CHECK-NEXT: vmov.u16 r1, q3[5] -; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vmov.32 q5[1], r1 ; CHECK-NEXT: vmov.u16 r1, q3[6] -; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.32 q5[2], r1 ; CHECK-NEXT: vmov.u16 r1, q3[7] -; CHECK-NEXT: vmov.32 q4[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vcmp.i32 ne, q4, zr -; CHECK-NEXT: vmov.32 q4[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov.32 q4[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: vmov.32 q4[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov.32 q5[3], r1 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vcmp.i32 ne, q5, zr +; CHECK-NEXT: vmov.32 q5[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.32 q5[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.32 q5[2], r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.32 q5[3], r1 ; CHECK-NEXT: vmov.u16 r1, q2[0] +; CHECK-NEXT: vmovlb.s8 q5, q5 +; CHECK-NEXT: vmovlb.s16 q5, q5 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q1, q5 ; CHECK-NEXT: vmov.32 q5[0], r1 ; CHECK-NEXT: vmov.u16 r1, q2[1] ; CHECK-NEXT: vmov.32 q5[1], r1 @@ -3006,44 +2989,39 @@ define arm_aapcs_vfpcc i32 
@add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.32 q5[2], r1 ; CHECK-NEXT: vmov.u16 r1, q2[3] ; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: vmovlb.s8 q4, q4 +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: vmovlb.s16 q4, q4 +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q3[0] -; CHECK-NEXT: vpsel q4, q4, q1 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmov.32 q5[0], r1 +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov.u16 r1, q3[0] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q4 +; CHECK-NEXT: vmov.32 q4[0], r1 ; CHECK-NEXT: vmov.u16 r1, q3[1] -; CHECK-NEXT: vmov.32 q5[1], r1 +; CHECK-NEXT: vmov.32 q4[1], r1 ; CHECK-NEXT: vmov.u16 r1, q3[2] -; CHECK-NEXT: vmov.32 q5[2], r1 +; CHECK-NEXT: vmov.32 q4[2], r1 ; CHECK-NEXT: vmov.u16 r1, q3[3] -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q0[9] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.u8 r1, q0[10] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov.u8 r1, q0[11] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vmovlb.s8 q0, q3 -; CHECK-NEXT: vpsel q2, q2, q1 -; CHECK-NEXT: vcmp.i32 ne, q5, zr ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q4, q4, q6 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 
+; CHECK-NEXT: vadd.i32 q0, q2, q1 ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -3100,87 +3078,83 @@ entry: define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %b, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.u8 r1, q1[1] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r1, q1[3] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.u8 r1, q1[4] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.u8 r1, q1[5] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r1, q1[7] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vcmp.i16 ne, q2, zr ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: 
vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[7] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] -; CHECK-NEXT: vmov.16 q4[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmovlb.u8 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.16 q3[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.16 q3[2], r1 +; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.16 q3[4], r1 +; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.16 q3[7], r1 +; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q0[9] ; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.u8 r1, q0[10] ; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r1, q0[11] ; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.16 q1[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.u8 r1, q0[13] ; CHECK-NEXT: vmov.16 q1[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: vmov.16 q1[6], r1 
-; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmovlb.u8 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u8 r1, q0[15] ; CHECK-NEXT: vmov.16 q1[7], r1 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vpt.i16 ne, q3, zr +; CHECK-NEXT: vaddt.i16 q2, q2, q0 +; CHECK-NEXT: vaddva.u16 r0, q2 ; CHECK-NEXT: uxth r0, r0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -3194,87 +3168,83 @@ entry: define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %b, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.u8 r1, q1[1] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r1, q1[3] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.u8 r1, q1[4] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.u8 r1, q1[5] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r1, q1[7] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vcmp.i16 ne, q2, zr ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.u8 r1, q0[1] 
; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[7] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] -; CHECK-NEXT: vmov.16 q4[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov.16 q3[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.16 q3[2], r1 +; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.16 q3[4], r1 +; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.16 q3[7], r1 +; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q0[9] ; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: 
vmov.u8 r1, q0[10] ; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r1, q0[11] ; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.16 q1[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.u8 r1, q0[13] ; CHECK-NEXT: vmov.16 q1[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: vmov.16 q1[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmovlb.s8 q3, q2 -; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u8 r1, q0[15] ; CHECK-NEXT: vmov.16 q1[7], r1 -; CHECK-NEXT: vpsel q3, q3, q2 -; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vmovlb.s8 q0, q1 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vpt.i16 ne, q3, zr +; CHECK-NEXT: vaddt.i16 q2, q2, q0 +; CHECK-NEXT: vaddva.u16 r0, q2 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index e02f3f6..decd0dc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -223,74 +223,71 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i32_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.i8 q3, #0x0 ; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q2, zr ; CHECK-NEXT: vpsel q2, q4, q3 -; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; 
CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmovlb.u16 q5, q3 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmovlb.u16 q4, q3 ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmovlb.u16 q6, q3 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmovlb.u16 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q4, q6, q5 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmovlb.u16 q1, q2 +; CHECK-NEXT: vmult.i32 q3, q5, q4 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: 
vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmovlb.u16 q0, q2 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vmult.i32 q3, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmullb.u16 q0, q1, q2 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vaddv.u32 r0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -305,74 +302,71 @@ entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) { ; CHECK-LABEL: add_v8i16_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.i8 q3, #0x0 ; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q2, zr ; CHECK-NEXT: vpsel q2, q4, q3 -; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: 
vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmovlb.s16 q5, q3 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmovlb.s16 q4, q3 ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmovlb.s16 q6, q3 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmovlb.s16 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q4, q6, q5 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmovlb.s16 q1, q2 +; CHECK-NEXT: vmult.i32 q3, q5, q4 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, 
q2[5] +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmovlb.s16 q0, q2 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vmult.i32 q3, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmullb.s16 q0, q1, q2 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vaddv.u32 r0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -908,180 +902,180 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vcmp.i8 eq, q3, zr +; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q7, q2, q0 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u8 r0, q7[8] -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q7[9] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: 
vmov.u8 r0, q7[10] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q7[11] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q7[12] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q7[13] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q7[14] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q7[15] -; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpsel q1, q3, q0 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vpsel q4, q4, q0 -; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0xff +; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.u16 r0, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.u16 r0, q5[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov.u16 r0, q5[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.u16 r0, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; 
CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q2[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[12] +; CHECK-NEXT: vmov.u8 r0, q4[4] ; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[13] +; CHECK-NEXT: vmov.u8 r0, q4[5] ; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[14] +; CHECK-NEXT: vmov.u8 r0, q4[6] ; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[15] +; CHECK-NEXT: vmov.u8 r0, q4[7] ; CHECK-NEXT: vmov.32 q6[3], r0 -; CHECK-NEXT: vand q5, q0, q2 -; CHECK-NEXT: vand q0, q6, q2 -; CHECK-NEXT: vmov.i32 q6, #0x0 -; CHECK-NEXT: vmov.u8 r0, q7[0] -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q6, q0, q5 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vand q7, q0, q3 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q7[1] -; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vand q3, q6, q3 ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q7[2] +; CHECK-NEXT: vmov.u8 r0, q1[10] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q7[3] +; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q7[4] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q7[5] +; CHECK-NEXT: vmov.u8 r0, q1[13] ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q7[6] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q7[7] +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.i32 q6, #0x0 ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q6, q3, q7 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: 
vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vpsel q5, q5, q0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.32 q7[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q4[12] +; CHECK-NEXT: vmov.32 q7[0], r0 +; CHECK-NEXT: vmov.u8 r0, q4[13] ; CHECK-NEXT: vmov.32 q7[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[6] +; CHECK-NEXT: vmov.u8 r0, q4[14] ; CHECK-NEXT: vmov.32 q7[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] +; CHECK-NEXT: vmov.u8 r0, q4[15] ; CHECK-NEXT: vmov.32 q7[3], r0 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q7, q7, q2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q6, q7, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: 
vadd.i32 q0, q6, q0 -; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.32 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vand q6, q4, q2 -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vand q7, q4, q2 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vand q7, q7, q3 ; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmul.i32 q1, q7, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q4, q7, q6 -; CHECK-NEXT: vmov.32 q6[0], r0 +; CHECK-NEXT: vaddt.i32 q6, q6, q1 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.32 q6[1], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.32 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vand q1, q5, q2 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr 
+; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[3] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q4[0] ; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] +; CHECK-NEXT: vmov.u8 r0, q4[1] ; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: vmov.u8 r0, q4[2] ; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[3] +; CHECK-NEXT: vmov.u8 r0, q4[3] ; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vand q2, q5, q2 -; CHECK-NEXT: vpt.i32 ne, q6, zr -; CHECK-NEXT: vmult.i32 q0, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q0, q4 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vand q5, q5, q3 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q7, q5, q1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q4[8] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q4[9] +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q4[10] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q4[11] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, 
q7, q0 +; CHECK-NEXT: vadd.i32 q0, q1, q6 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1101,191 +1095,185 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q0, #0xff -; CHECK-NEXT: vpsel q6, q0, q2 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q7, q2, q0 ; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u8 r0, q6[8] -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r0, q7[0] +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[9] +; CHECK-NEXT: vmov.u8 r0, q7[1] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] +; CHECK-NEXT: vmov.u8 r0, q7[2] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] +; CHECK-NEXT: vmov.u8 r0, q7[3] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[12] +; CHECK-NEXT: vmov.u8 r0, q7[4] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[13] +; CHECK-NEXT: vmov.u8 r0, q7[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] +; CHECK-NEXT: vmov.u8 r0, q7[6] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] +; CHECK-NEXT: vmov.u8 r0, q7[7] ; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q7, q0, q4 -; CHECK-NEXT: vmov.u16 r0, q7[4] +; CHECK-NEXT: vpsel q4, q4, q0 +; CHECK-NEXT: vmov.u16 r0, q4[4] ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.u16 r0, 
q4[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[6] +; CHECK-NEXT: vmov.u16 r0, q4[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] +; CHECK-NEXT: vmov.u16 r0, q4[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[12] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[13] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[15] -; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vmov.u8 r0, q3[4] +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.u8 r0, q3[5] +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q3[6] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q3[7] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q4, q0 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmovlb.s16 q2, q5 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmovlb.s16 q5, q0 +; CHECK-NEXT: vmovlb.s16 q0, q2 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov.u8 r0, q7[8] ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q0, q2, q4 -; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmult.i32 q6, q0, q5 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[1] +; CHECK-NEXT: vmov.u8 r0, q7[9] +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q6[2] +; 
CHECK-NEXT: vmov.u8 r0, q7[10] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q6[3] +; CHECK-NEXT: vmov.u8 r0, q7[11] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[4] +; CHECK-NEXT: vmov.u8 r0, q7[12] ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q6[5] +; CHECK-NEXT: vmov.u8 r0, q7[13] ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q6[6] +; CHECK-NEXT: vmov.u8 r0, q7[14] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q6[7] +; CHECK-NEXT: vmov.u8 r0, q7[15] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vpsel q4, q2, q0 -; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vmov.u16 r0, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.u16 r0, q5[5] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov.u16 r0, q5[6] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.u16 r0, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.u8 r0, q1[13] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.u8 r0, q1[15] ; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.32 q6[3], r0 +; CHECK-NEXT: vmov.u8 
r0, q3[12] +; CHECK-NEXT: vmov.32 q7[0], r0 +; CHECK-NEXT: vmov.u8 r0, q3[13] +; CHECK-NEXT: vmov.32 q7[1], r0 +; CHECK-NEXT: vmov.u8 r0, q3[14] +; CHECK-NEXT: vmov.32 q7[2], r0 +; CHECK-NEXT: vmov.u8 r0, q3[15] +; CHECK-NEXT: vmov.32 q7[3], r0 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s8 q6, q6 +; CHECK-NEXT: vmovlb.s8 q7, q7 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmovlb.s16 q6, q6 +; CHECK-NEXT: vmovlb.s16 q7, q7 +; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmul.i32 q0, q7, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q5, q6, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r0, q7[0] -; CHECK-NEXT: vadd.i32 q6, q5, q0 +; CHECK-NEXT: vaddt.i32 q6, q6, q0 ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q7[1] +; CHECK-NEXT: vmov.u16 r0, q4[1] ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[2] +; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q7[3] +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q3[0] +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q3[1] +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q3[3] +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmovlb.s8 q4, q4 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.u16 r0, q5[0] +; CHECK-NEXT: vmovlb.s16 q4, q4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q2, q4, q0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.32 q0[2], 
r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] ; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov.u8 r0, q1[8] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.u8 r0, q1[10] ; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q2, q0 -; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q5, q0 -; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q0, q5, q2 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmovlb.s8 q1, q2 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: 
vmov q3, q7 -; CHECK-NEXT: vmovlb.s8 q2, q2 -; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q3, q2, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q6 +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q2, q6 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1349,123 +1337,120 @@ entry: define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i16_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmov.u8 r0, q2[0] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.u8 r0, q2[1] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.u8 r0, q2[3] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.u8 r0, q2[7] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.16 
q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmovlb.u8 q5, q3 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmovlb.u8 q4, q3 ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmovlb.u8 q6, q3 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmovlb.u8 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i16 q4, q6, q5 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.16 q5[4], r0 
-; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q2 +; CHECK-NEXT: vmult.i16 q3, q5, q4 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r0, q1[9] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r0, q1[10] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.u8 r0, q1[13] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.16 q2[6], r0 -; 
CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.u8 r0, q1[15] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmovlb.u8 q0, q2 -; CHECK-NEXT: vpt.i16 ne, q5, zr -; CHECK-NEXT: vmult.i16 q3, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmullb.u8 q0, q1, q2 +; CHECK-NEXT: vpt.i16 ne, q4, zr +; CHECK-NEXT: vaddt.i16 q3, q3, q0 +; CHECK-NEXT: vaddv.u16 r0, q3 ; CHECK-NEXT: uxth r0, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -1480,123 +1465,120 @@ entry: define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmov.u8 r0, q2[0] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.u8 r0, q2[1] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.u8 r0, q2[3] ; 
CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.u8 r0, q2[7] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r0, q1[5] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmovlb.s8 q5, q3 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmovlb.s8 q4, q3 ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 
r0, q2[0] -; CHECK-NEXT: vmovlb.s8 q6, q3 -; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i16 q4, q6, q5 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmovlb.s8 q1, q2 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] +; CHECK-NEXT: vmovlb.s8 q5, q3 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i16 q3, q5, q4 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[11] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] 
+; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov.u8 r0, q2[14] +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] ; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.u8 r0, q1[9] ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r0, q1[10] ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r0, q1[12] ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.u8 r0, q1[13] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.u8 r0, q1[15] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmovlb.s8 q0, q2 -; CHECK-NEXT: vpt.i16 ne, q5, zr -; CHECK-NEXT: vmult.i16 q3, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.16 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmullb.s8 q0, q1, q2 +; CHECK-NEXT: vpt.i16 ne, q4, zr +; CHECK-NEXT: vaddt.i16 q3, q3, q0 +; CHECK-NEXT: vaddv.u16 r0, q3 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -2825,74 +2807,71 @@ entry: define 
arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.i8 q3, #0x0 ; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q2, zr ; CHECK-NEXT: vpsel q2, q4, q3 -; CHECK-NEXT: vmov.u16 r1, q2[4] +; CHECK-NEXT: vmov.u16 r1, q2[0] ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q2[5] +; CHECK-NEXT: vmov.u16 r1, q2[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q2[6] +; CHECK-NEXT: vmov.u16 r1, q2[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov.u16 r1, q2[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.u16 q5, q3 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmovlb.u16 q4, q3 ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r1, q0[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vmovlb.u16 q6, q3 +; CHECK-NEXT: vmov.u16 r1, q2[4] +; CHECK-NEXT: vmovlb.u16 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; 
CHECK-NEXT: vmult.i32 q4, q6, q5 -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u16 r1, q2[2] -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u16 r1, q2[3] -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmovlb.u16 q1, q2 +; CHECK-NEXT: vmult.i32 q3, q5, q4 +; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.u16 r1, q2[5] +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vmov.u16 r1, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov.u16 r1, q1[4] ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.u16 r1, q1[5] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r1, q1[6] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r1, q1[7] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmovlb.u16 q0, q2 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vmult.i32 q3, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmullb.u16 q0, q1, q2 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vaddva.u32 r0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -2908,74 +2887,71 @@ 
entry: define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) { ; CHECK-LABEL: add_v8i16_v8i32_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.i8 q3, #0x0 ; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vcmp.i16 eq, q2, zr ; CHECK-NEXT: vpsel q2, q4, q3 -; CHECK-NEXT: vmov.u16 r1, q2[4] +; CHECK-NEXT: vmov.u16 r1, q2[0] ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q2[5] +; CHECK-NEXT: vmov.u16 r1, q2[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q2[6] +; CHECK-NEXT: vmov.u16 r1, q2[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov.u16 r1, q2[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.s16 q5, q3 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmovlb.s16 q4, q3 ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r1, q0[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[2] ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vmovlb.s16 q6, q3 +; CHECK-NEXT: vmov.u16 r1, q2[4] +; CHECK-NEXT: vmovlb.s16 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: 
vpst -; CHECK-NEXT: vmult.i32 q4, q6, q5 -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u16 r1, q2[2] -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u16 r1, q2[3] -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmovlb.s16 q1, q2 +; CHECK-NEXT: vmult.i32 q3, q5, q4 +; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.u16 r1, q2[5] +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vmov.u16 r1, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov.u16 r1, q1[4] ; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.u16 r1, q1[5] ; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r1, q1[6] ; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r1, q1[7] ; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmovlb.s16 q0, q2 -; CHECK-NEXT: vpt.i32 ne, q5, zr -; CHECK-NEXT: vmult.i32 q3, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmullb.s16 q0, q1, q2 +; CHECK-NEXT: vpt.i32 ne, q4, zr +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vaddva.u32 r0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -3533,86 
+3509,36 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q7, q2, q0 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u8 r1, q7[8] -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q7[9] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q7[10] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q7[11] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q7[12] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q7[13] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q7[14] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q7[15] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vpsel q4, q4, q0 -; CHECK-NEXT: vmov.u16 r1, q4[4] -; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u16 r1, q4[5] -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[6] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[7] -; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[13] -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] -; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[12] -; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[13] -; CHECK-NEXT: vmov.32 q6[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[14] -; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[15] -; CHECK-NEXT: vmov.32 q6[3], r1 -; CHECK-NEXT: vand q5, q0, q2 
-; CHECK-NEXT: vand q0, q6, q2 -; CHECK-NEXT: vmov.i32 q6, #0x0 -; CHECK-NEXT: vmov.u8 r1, q7[0] -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q6, q0, q5 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q7[1] -; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q7[2] -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q7[3] -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q7[4] -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.u8 r1, q7[5] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.u8 r1, q7[6] -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.u8 r1, q7[7] -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vcmp.i8 eq, q3, zr +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpsel q1, q3, q0 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q3[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[1] +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: vmov.16 q3[2], r1 +; CHECK-NEXT: vmov.u8 r1, q1[3] +; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[4] +; CHECK-NEXT: vmov.16 q3[4], r1 +; CHECK-NEXT: vmov.u8 r1, q1[5] +; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov.u8 r1, q1[6] +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: vmov.u8 r1, q1[7] +; CHECK-NEXT: vmov.16 q3[7], r1 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0xff ; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: 
vmov.u16 r1, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u16 r1, q5[5] @@ -3621,92 +3547,142 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vmov.u16 r1, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] +; CHECK-NEXT: vmov.u8 r1, q2[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] +; CHECK-NEXT: vmov.u8 r1, q2[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] +; CHECK-NEXT: vmov.u8 r1, q2[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] +; CHECK-NEXT: vmov.u8 r1, q2[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[4] -; CHECK-NEXT: vmov.32 q7[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[5] -; CHECK-NEXT: vmov.32 q7[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[6] -; CHECK-NEXT: vmov.32 q7[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[7] -; CHECK-NEXT: vmov.32 q7[3], r1 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q7, q7, q2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q6, q7, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r1, q4[0] -; CHECK-NEXT: vadd.i32 q0, q6, q0 +; CHECK-NEXT: vmov.u8 r1, q4[4] ; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r1, q4[5] ; CHECK-NEXT: vmov.32 q6[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[2] +; CHECK-NEXT: vmov.u8 r1, q4[6] ; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[3] +; CHECK-NEXT: vmov.u8 r1, q4[7] ; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vmov.u8 r1, q1[8] -; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vand q7, q0, q3 +; CHECK-NEXT: vmov.16 q0[0], r1 ; CHECK-NEXT: vmov.u8 r1, q1[9] -; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vand q3, q6, q3 +; CHECK-NEXT: vmov.16 q0[1], r1 ; CHECK-NEXT: vmov.u8 r1, q1[10] -; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.16 q0[2], r1 ; 
CHECK-NEXT: vmov.u8 r1, q1[11] -; CHECK-NEXT: vmov.32 q4[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[8] -; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vand q6, q4, q2 -; CHECK-NEXT: vmov.32 q4[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[9] -; CHECK-NEXT: vmov.32 q4[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[10] -; CHECK-NEXT: vmov.32 q4[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[11] -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q4[3], r1 -; CHECK-NEXT: vand q7, q4, q2 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q6, q3, q7 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q4[12] +; CHECK-NEXT: vmov.32 q7[0], r1 +; CHECK-NEXT: vmov.u8 r1, q4[13] +; CHECK-NEXT: vmov.32 q7[1], r1 +; CHECK-NEXT: vmov.u8 r1, q4[14] +; CHECK-NEXT: vmov.32 q7[2], r1 +; CHECK-NEXT: vmov.u8 r1, q4[15] +; CHECK-NEXT: vmov.32 q7[3], r1 +; 
CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vand q7, q7, q3 ; CHECK-NEXT: vmov.u16 r1, q5[0] -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmul.i32 q1, q7, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q4, q7, q6 -; CHECK-NEXT: vmov.32 q6[0], r1 +; CHECK-NEXT: vaddt.i32 q6, q6, q1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.u16 r1, q5[1] -; CHECK-NEXT: vmov.32 q6[1], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 ; CHECK-NEXT: vmov.u16 r1, q5[2] -; CHECK-NEXT: vmov.32 q6[2], r1 +; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vmov.u16 r1, q5[3] -; CHECK-NEXT: vmov.32 q6[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[0] -; CHECK-NEXT: vand q1, q5, q2 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[0] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[1] +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[3] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q4[0] ; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[1] +; CHECK-NEXT: vmov.u8 r1, q4[1] ; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[2] +; CHECK-NEXT: vmov.u8 r1, q4[2] ; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[3] +; CHECK-NEXT: vmov.u8 r1, q4[3] ; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: vand q2, q5, q2 -; CHECK-NEXT: vpt.i32 ne, q6, zr -; CHECK-NEXT: vmult.i32 q0, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q0, q4 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vand q5, q5, q3 +; CHECK-NEXT: vpst +; 
CHECK-NEXT: vmult.i32 q7, q5, q1 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u8 r1, q4[8] +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.u8 r1, q4[9] +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q4[10] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u8 r1, q4[11] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q1, q7, q0 +; CHECK-NEXT: vadd.i32 q0, q1, q6 ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -3727,191 +3703,185 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q0, #0xff -; CHECK-NEXT: vpsel q6, q0, q2 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q7, q2, q0 ; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov.u8 r1, q6[8] -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.u8 r1, q7[0] +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q2[0], r1 
-; CHECK-NEXT: vmov.u8 r1, q6[9] +; CHECK-NEXT: vmov.u8 r1, q7[1] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q6[10] +; CHECK-NEXT: vmov.u8 r1, q7[2] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q6[11] +; CHECK-NEXT: vmov.u8 r1, q7[3] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q6[12] +; CHECK-NEXT: vmov.u8 r1, q7[4] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q6[13] +; CHECK-NEXT: vmov.u8 r1, q7[5] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q6[14] +; CHECK-NEXT: vmov.u8 r1, q7[6] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q6[15] +; CHECK-NEXT: vmov.u8 r1, q7[7] ; CHECK-NEXT: vmov.16 q2[7], r1 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q7, q0, q4 -; CHECK-NEXT: vmov.u16 r1, q7[4] +; CHECK-NEXT: vpsel q4, q4, q0 +; CHECK-NEXT: vmov.u16 r1, q4[4] ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u16 r1, q7[5] +; CHECK-NEXT: vmov.u16 r1, q4[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q7[6] +; CHECK-NEXT: vmov.u16 r1, q4[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q7[7] +; CHECK-NEXT: vmov.u16 r1, q4[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.u8 r1, q1[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.u8 r1, q1[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r1, q1[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[12] -; CHECK-NEXT: vmov.32 q5[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[13] -; CHECK-NEXT: vmov.32 q5[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[14] -; CHECK-NEXT: vmov.32 q5[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[15] -; CHECK-NEXT: vmov.32 q5[3], r1 +; CHECK-NEXT: vmov.u8 r1, q3[4] +; CHECK-NEXT: vmov.32 
q2[0], r1 +; CHECK-NEXT: vmov.u8 r1, q3[5] +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov.u8 r1, q3[6] +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.u8 r1, q3[7] +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q4, q0 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmovlb.s16 q2, q5 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmovlb.s16 q5, q0 +; CHECK-NEXT: vmovlb.s16 q0, q2 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov.u8 r1, q7[8] ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q0, q2, q4 -; CHECK-NEXT: vmov.u8 r1, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmult.i32 q6, q0, q5 ; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q6[1] +; CHECK-NEXT: vmov.u8 r1, q7[9] +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q6[2] +; CHECK-NEXT: vmov.u8 r1, q7[10] ; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q6[3] +; CHECK-NEXT: vmov.u8 r1, q7[11] ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q6[4] +; CHECK-NEXT: vmov.u8 r1, q7[12] ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.u8 r1, q6[5] +; CHECK-NEXT: vmov.u8 r1, q7[13] ; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.u8 r1, q6[6] +; CHECK-NEXT: vmov.u8 r1, q7[14] ; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.u8 r1, q6[7] +; CHECK-NEXT: vmov.u8 r1, q7[15] ; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vpsel q4, q2, q0 -; CHECK-NEXT: vmov.u16 r1, q4[4] +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vpsel q5, q5, q0 +; CHECK-NEXT: vmov.u16 r1, q5[4] ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: 
vmov.u16 r1, q4[5] +; CHECK-NEXT: vmov.u16 r1, q5[5] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[6] +; CHECK-NEXT: vmov.u16 r1, q5[6] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[7] +; CHECK-NEXT: vmov.u16 r1, q5[7] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] +; CHECK-NEXT: vmov.u8 r1, q1[12] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] +; CHECK-NEXT: vmov.u8 r1, q1[13] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] +; CHECK-NEXT: vmov.u8 r1, q1[14] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] +; CHECK-NEXT: vmov.u8 r1, q1[15] ; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[4] -; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[5] -; CHECK-NEXT: vmov.32 q6[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[6] -; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[7] -; CHECK-NEXT: vmov.32 q6[3], r1 +; CHECK-NEXT: vmov.u8 r1, q3[12] +; CHECK-NEXT: vmov.32 q7[0], r1 +; CHECK-NEXT: vmov.u8 r1, q3[13] +; CHECK-NEXT: vmov.32 q7[1], r1 +; CHECK-NEXT: vmov.u8 r1, q3[14] +; CHECK-NEXT: vmov.32 q7[2], r1 +; CHECK-NEXT: vmov.u8 r1, q3[15] +; CHECK-NEXT: vmov.32 q7[3], r1 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmovlb.s8 q7, q7 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s16 q7, q7 +; CHECK-NEXT: vmov.u16 r1, q4[0] +; CHECK-NEXT: vmul.i32 q0, q7, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i32 q6, q6, q0 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u16 r1, q4[1] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u16 r1, q4[2] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u16 r1, q4[3] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[1] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u8 r1, 
q1[3] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u8 r1, q3[0] +; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.u8 r1, q3[1] +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: vmov.u8 r1, q3[2] +; CHECK-NEXT: vmov.32 q4[2], r1 +; CHECK-NEXT: vmov.u8 r1, q3[3] +; CHECK-NEXT: vmov.32 q4[3], r1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s8 q6, q6 +; CHECK-NEXT: vmovlb.s8 q4, q4 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmovlb.s16 q6, q6 +; CHECK-NEXT: vmov.u16 r1, q5[0] +; CHECK-NEXT: vmovlb.s16 q4, q4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q5, q6, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r1, q7[0] -; CHECK-NEXT: vadd.i32 q6, q5, q0 +; CHECK-NEXT: vmult.i32 q2, q4, q0 ; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov.u16 r1, q7[1] +; CHECK-NEXT: vmov.u16 r1, q5[1] ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov.u16 r1, q7[2] +; CHECK-NEXT: vmov.u16 r1, q5[2] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.u16 r1, q7[3] +; CHECK-NEXT: vmov.u16 r1, q5[3] ; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmov.u8 r1, q1[8] ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u8 r1, q1[9] -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.32 q0[1], r1 ; CHECK-NEXT: vmov.u8 r1, q1[10] ; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vmov.u8 r1, q1[11] ; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmov.u8 r1, q3[8] -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q2, q0 -; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.u8 r1, q3[9] -; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 ; CHECK-NEXT: vmov.u8 r1, q3[10] -; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vmov.u8 r1, q3[11] -; CHECK-NEXT: vmov.32 q0[3], r1 -; CHECK-NEXT: vmov.u16 r1, q4[0] +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q5, q0 -; CHECK-NEXT: vmov 
q0, q7 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q0, q5, q2 -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[2] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[3] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q3[0] -; CHECK-NEXT: vmovlb.s8 q1, q2 -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q3[1] +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q3[2] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q3[3] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov q3, q7 -; CHECK-NEXT: vmovlb.s8 q2, q2 -; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i32 q3, q2, q1 -; CHECK-NEXT: vadd.i32 q0, q3, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q6 +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q2, q6 ; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -3978,123 +3948,120 @@ entry: define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vpsel 
q2, q3, q2 -; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vmov.u8 r1, q2[0] ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.u8 r1, q2[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: vmov.u8 r1, q2[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.u8 r1, q2[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vmov.u8 r1, q2[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.u8 r1, q2[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.u8 r1, q2[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.u8 r1, q2[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.u8 r1, q1[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r1, q1[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.u8 r1, q1[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.u8 r1, q1[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r1, q1[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: vmovlb.u8 q5, q3 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmovlb.u8 q4, q3 ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: 
vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q2[0] -; CHECK-NEXT: vmovlb.u8 q6, q3 +; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vmovlb.u8 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i16 q4, q6, q5 -; CHECK-NEXT: vmov.16 q5[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[1] -; CHECK-NEXT: vmov.16 q5[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[4] -; CHECK-NEXT: vmov.16 q5[4], r1 -; CHECK-NEXT: vmov.u8 r1, q2[5] -; CHECK-NEXT: vmov.16 q5[5], r1 -; CHECK-NEXT: vmov.u8 r1, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r1 -; CHECK-NEXT: vmov.u8 r1, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q2 +; CHECK-NEXT: vmult.i16 q3, q5, q4 +; CHECK-NEXT: vmov.16 q4[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.16 q4[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: 
vmov.16 q4[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.16 q4[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vmov.16 q4[4], r1 +; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.16 q4[5], r1 +; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.16 q4[6], r1 +; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.16 q4[7], r1 +; CHECK-NEXT: vmov.u8 r1, q1[8] ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q1[9] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.u8 r1, q1[10] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r1, q1[11] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r1, q1[12] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.u8 r1, q1[13] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.u8 r1, q1[14] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: vmov.u8 r1, q1[15] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmovlb.u8 q0, q2 -; CHECK-NEXT: vpt.i16 ne, q5, zr -; CHECK-NEXT: vmult.i16 q3, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.16 q1[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.16 q1[4], r1 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.16 q1[5], r1 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.16 q1[6], r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r1 +; CHECK-NEXT: vmullb.u8 q0, q1, q2 +; CHECK-NEXT: vpt.i16 ne, q4, zr +; CHECK-NEXT: vaddt.i16 q3, q3, q0 +; CHECK-NEXT: vaddva.u16 r0, q3 ; CHECK-NEXT: uxth r0, r0 -; CHECK-NEXT: vpop 
{d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer @@ -4110,123 +4077,120 @@ entry: define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) { ; CHECK-LABEL: add_v16i8_v16i16_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vmov.u8 r1, q2[0] ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.u8 r1, q2[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: vmov.u8 r1, q2[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.u8 r1, q2[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vmov.u8 r1, q2[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.u8 r1, q2[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.u8 r1, q2[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.u8 r1, q2[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.u8 r1, q1[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r1, q1[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.u8 r1, q1[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: 
vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.u8 r1, q1[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r1, q1[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: vmovlb.s8 q5, q3 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmovlb.s8 q4, q3 ; CHECK-NEXT: vmov.16 q3[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.16 q3[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.16 q3[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: vmov.16 q3[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: vmov.16 q3[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[7] ; CHECK-NEXT: vmov.16 q3[7], r1 -; CHECK-NEXT: vmov.u8 r1, q2[0] -; CHECK-NEXT: vmovlb.s8 q6, q3 +; CHECK-NEXT: vmov.u8 r1, q2[8] +; CHECK-NEXT: vmovlb.s8 q5, q3 ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.i16 q4, q6, q5 -; CHECK-NEXT: vmov.16 q5[0], r1 -; CHECK-NEXT: vmov.u8 r1, q2[1] -; CHECK-NEXT: vmov.16 q5[1], r1 -; CHECK-NEXT: vmov.u8 r1, q2[2] -; CHECK-NEXT: vmov.16 q5[2], r1 -; CHECK-NEXT: vmov.u8 r1, q2[3] -; CHECK-NEXT: vmov.16 q5[3], r1 -; CHECK-NEXT: vmov.u8 r1, q2[4] -; CHECK-NEXT: vmov.16 q5[4], r1 -; CHECK-NEXT: vmov.u8 r1, q2[5] -; CHECK-NEXT: vmov.16 q5[5], r1 -; CHECK-NEXT: vmov.u8 r1, q2[6] -; CHECK-NEXT: vmov.16 q5[6], r1 -; CHECK-NEXT: vmov.u8 r1, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r1 -; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov.16 
q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmovlb.s8 q1, q2 +; CHECK-NEXT: vmult.i16 q3, q5, q4 +; CHECK-NEXT: vmov.16 q4[0], r1 +; CHECK-NEXT: vmov.u8 r1, q2[9] +; CHECK-NEXT: vmov.16 q4[1], r1 +; CHECK-NEXT: vmov.u8 r1, q2[10] +; CHECK-NEXT: vmov.16 q4[2], r1 +; CHECK-NEXT: vmov.u8 r1, q2[11] +; CHECK-NEXT: vmov.16 q4[3], r1 +; CHECK-NEXT: vmov.u8 r1, q2[12] +; CHECK-NEXT: vmov.16 q4[4], r1 +; CHECK-NEXT: vmov.u8 r1, q2[13] +; CHECK-NEXT: vmov.16 q4[5], r1 +; CHECK-NEXT: vmov.u8 r1, q2[14] +; CHECK-NEXT: vmov.16 q4[6], r1 +; CHECK-NEXT: vmov.u8 r1, q2[15] +; CHECK-NEXT: vmov.16 q4[7], r1 +; CHECK-NEXT: vmov.u8 r1, q1[8] ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q1[9] ; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.u8 r1, q1[10] ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r1, q1[11] ; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r1, q1[12] ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.u8 r1, q1[13] ; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.u8 r1, q1[14] ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: vmov.u8 r1, q1[15] ; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmovlb.s8 q0, q2 -; CHECK-NEXT: vpt.i16 ne, q5, zr -; CHECK-NEXT: vmult.i16 q3, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.16 q1[0], 
r1 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.16 q1[4], r1 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.16 q1[5], r1 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.16 q1[6], r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r1 +; CHECK-NEXT: vmullb.s8 q0, q1, q2 +; CHECK-NEXT: vpt.i16 ne, q4, zr +; CHECK-NEXT: vaddt.i16 q3, q3, q0 +; CHECK-NEXT: vaddva.u16 r0, q3 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index 6fdc116..d8588c4 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -3589,6 +3589,9 @@ static bool hasNullFragReference(DagInit *DI) { if (Operator->getName() == "null_frag") return true; // If any of the arguments reference the null fragment, return true. for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) { + if (auto Arg = dyn_cast(DI->getArg(i))) + if (Arg->getDef()->getName() == "null_frag") + return true; DagInit *Arg = dyn_cast(DI->getArg(i)); if (Arg && hasNullFragReference(Arg)) return true;