From 543236232c79221c4da261246e49888844697539 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 14 Oct 2019 15:19:33 +0000
Subject: [PATCH] [ARM] Selection for MVE VMOVN

This adds both VMOVNt and VMOVNb instruction selection from the appropriate
shuffles. We detect shuffle masks of the form:
  0, N, 2, N+2, 4, N+4, ...
or
  0, N+1, 2, N+3, 4, N+5, ...
ISel will also try the opposite patterns, with inputs reversed. These are
selected to VMOVNt and VMOVNb respectively.

Differential Revision: https://reviews.llvm.org/D68283

llvm-svn: 374781
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |  35 ++++
 llvm/lib/Target/ARM/ARMISelLowering.h   |   1 +
 llvm/lib/Target/ARM/ARMInstrMVE.td      |  12 ++
 llvm/test/CodeGen/Thumb2/mve-vmovn.ll   | 318 ++------------------------------
 4 files changed, 66 insertions(+), 300 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d8b2c55..71d53a3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1604,6 +1604,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VTRN:          return "ARMISD::VTRN";
   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
+  case ARMISD::VMOVN:         return "ARMISD::VMOVN";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
@@ -6891,6 +6892,29 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
   return true;
 }
 
+static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+    return false;
+
+  // If Top
+  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
+  //   This inserts Input2 into Input1
+  // else if not Top
+  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
+  //   This inserts Input1 into Input2
+  unsigned Offset = Top ? 0 : 1;
+  for (unsigned i = 0; i < NumElts; i+=2) {
+    if (M[i] >= 0 && M[i] != (int)i)
+      return false;
+    if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
+      return false;
+  }
+
+  return true;
+}
+
 // If N is an integer constant that can be moved into a register in one
 // instruction, return an SDValue of such a constant (will become a MOV
 // instruction). Otherwise return null.
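[Editor's note, not part of the patch] To make the two accepted mask shapes concrete, here is a minimal standalone sketch of the same check written against plain std::vector<int> masks instead of the SelectionDAG types. The name isVMOVNMaskModel and the example masks are illustrative only; as in the patch, -1 stands for an undef lane, which is always accepted.

#include <cstdio>
#include <vector>

// Mirrors isVMOVNMask above: even result lanes must come from input 1
// unchanged, and odd result lanes must come from consecutive even (Top)
// or odd (Bottom) lanes of input 2. Undef (-1) lanes always match.
static bool isVMOVNMaskModel(const std::vector<int> &M, bool Top) {
  unsigned NumElts = M.size();
  unsigned Offset = Top ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(NumElts + i + Offset))
      return false;
  }
  return true;
}

int main() {
  // <0, N, 2, N+2, ...> with N = 8: the Top form, selected to VMOVNt.
  std::vector<int> TopMask = {0, 8, 2, 10, 4, 12, 6, 14};
  // <0, N+1, 2, N+3, ...> with N = 8: the Bottom form, selected to VMOVNb.
  std::vector<int> BottomMask = {0, 9, 2, 11, 4, 13, 6, 15};
  std::printf("top: %d bottom: %d\n", isVMOVNMaskModel(TopMask, true),
              isVMOVNMaskModel(BottomMask, false)); // prints "top: 1 bottom: 1"
  return 0;
}

With N = 8 this corresponds to the v8i16 case; the v16i8 case is the same with N = 16.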
@@ -7485,6 +7509,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
            isReverseMask(M, VT))
     return true;
+  else if (Subtarget->hasMVEIntegerOps() &&
+           (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
+    return true;
   else
     return false;
 }
@@ -7760,6 +7787,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
             .getValue(WhichResult);
     }
   }
+  if (ST->hasMVEIntegerOps()) {
+    if (isVMOVNMask(ShuffleMask, VT, 0))
+      return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
+                         DAG.getConstant(0, dl, MVT::i32));
+    if (isVMOVNMask(ShuffleMask, VT, 1))
+      return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
+                         DAG.getConstant(1, dl, MVT::i32));
+  }
 
   // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
   // shuffles that produce a result larger than their operands with:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index a89ef25..98e0684 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -197,6 +197,7 @@ class VectorType;
       VTRN,         // transpose
       VTBL1,        // 1-register shuffle with mask
       VTBL2,        // 2-register shuffle with mask
+      VMOVN,        // MVE vmovn
 
       // Vector multiply long:
       VMULLs,       // ...signed
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 80b45ce..dad9d51 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3501,6 +3501,18 @@ defm MVE_VQMOVNu32  : MVE_VxMOVxN_halves<"vqmovn",  "u32", 0b1, 0b1, 0b01>;
 defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
 defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
 
+def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+            (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+            (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
+            (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
+            (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+}
+
 class MVE_VCVT_ff<..., list<dag> pattern=[]>
   : MVE_qDest_qSrc<...>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ ... @@
 define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_trunc1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnt.i32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   %out = trunc <8 x i32> %strided.vec to <8 x i16>
   ret <8 x i16> %out
 }
@@ -31,23 +15,8 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_trunc2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
   %out = trunc <8 x i32> %strided.vec to <8 x i16>
   ret <8 x i16> %out
 }
@@ -58,39 +27,7 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_trunc1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnt.i16 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %out = trunc <16 x i16> %strided.vec to <16 x i8>
   ret <16 x i8> %out
 }
@@ -101,39 +38,8 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_trunc2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnt.i16 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   %out = trunc <16 x i16> %strided.vec to <16 x i8>
   ret <16 x i8> %out
 }
@@ -297,23 +203,7 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_t1(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_t1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnt.i32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i16> %out
 }
@@ -323,23 +213,8 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_t2(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_t2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
   ret <8 x i16> %out
 }
@@ -349,23 +224,8 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b1(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnb.i32 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i16> %out
 }
@@ -427,23 +287,7 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b4(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovnb.i32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
   ret <8 x i16> %out
 }
@@ -454,39 +298,7 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn8_b1(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vmovn8_b1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnt.i16 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
   ret <16 x i8> %out
 }
@@ -496,39 +308,8 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn8_b2(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vmovn8_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[0]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnt.i16 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 0, i32 18, i32 2, i32 20, i32 4, i32 22, i32 6, i32 24, i32 8, i32 26, i32 10, i32 28, i32 12, i32 30, i32 14>
   ret <16 x i8> %out
 }
@@ -538,39 +319,8 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn8_t1(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vmovn8_t1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnb.i16 q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   ret <16 x i8> %out
 }
@@ -664,39 +414,7 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmovn8_t4(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vmovn8_t4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov.8 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[3]
-; CHECK-NEXT:    vmov.8 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[5]
-; CHECK-NEXT:    vmov.8 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.8 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[7]
-; CHECK-NEXT:    vmov.8 q0[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.8 q0[8], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[9]
-; CHECK-NEXT:    vmov.8 q0[9], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.8 q0[10], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[11]
-; CHECK-NEXT:    vmov.8 q0[11], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.8 q0[12], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[13]
-; CHECK-NEXT:    vmov.8 q0[13], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.8 q0[14], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[15]
-; CHECK-NEXT:    vmov.8 q0[15], r0
+; CHECK-NEXT:    vmovnb.i16 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
   ret <16 x i8> %out
 }
-- 
2.7.4
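[Editor's note, not part of the patch] For reference, a scalar sketch of what the selected instructions do to the lanes, using the .i32 size; the function names vmovnt_i32/vmovnb_i32 and the array model are my own, not from the patch. VMOVNT writes the narrowed bottom half of each wide lane of Qm into the top (odd) half-lanes of Qd, and VMOVNB into the bottom (even) half-lanes; the remaining lanes of Qd are preserved, which is why the destination also appears as a source ($Qd_src) in the TableGen patterns above.

#include <cstdint>
#include <cstdio>

// Model a q register as eight 16-bit lanes; each pair of uint16_t values is
// one 32-bit lane in little-endian lane order (element 2*i is the low half).

// VMOVNT.I32 Qd, Qm: low 16 bits of each 32-bit lane of Qm go to the odd
// (top) 16-bit lanes of Qd; even lanes of Qd are unchanged.
static void vmovnt_i32(uint16_t Qd[8], const uint16_t Qm[8]) {
  for (int i = 0; i < 4; ++i)
    Qd[2 * i + 1] = Qm[2 * i];
}

// VMOVNB.I32 Qd, Qm: low 16 bits of each 32-bit lane of Qm go to the even
// (bottom) 16-bit lanes of Qd; odd lanes of Qd are unchanged.
static void vmovnb_i32(uint16_t Qd[8], const uint16_t Qm[8]) {
  for (int i = 0; i < 4; ++i)
    Qd[2 * i] = Qm[2 * i];
}

int main() {
  uint16_t Qd[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t Qm[8] = {100, 0, 102, 0, 104, 0, 106, 0};
  vmovnt_i32(Qd, Qm);
  // Qd is now {0, 100, 2, 102, 4, 104, 6, 106}: exactly the
  // <0, N, 2, N+2, ...> interleaving that isVMOVNMask matches.
  for (int i = 0; i < 8; ++i)
    std::printf("%u ", Qd[i]);
  std::printf("\n");
  return 0;
}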