From 6505124a0c7c648560aad88bac103b0738a8b5f0 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 12 Feb 2020 18:18:23 +0000 Subject: [PATCH] [ARM] Extra vmovn tests to show BE differences. NFC --- llvm/test/CodeGen/ARM/neon-vmovn.ll | 794 ++++++++++++++++++++++++++++++++++ llvm/test/CodeGen/Thumb2/mve-vmovn.ll | 322 ++++++++++++++ 2 files changed, 1116 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/neon-vmovn.ll diff --git a/llvm/test/CodeGen/ARM/neon-vmovn.ll b/llvm/test/CodeGen/ARM/neon-vmovn.ll new file mode 100644 index 0000000..675a38a --- /dev/null +++ b/llvm/test/CodeGen/ARM/neon-vmovn.ll @@ -0,0 +1,794 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv8-arm-none-eabi -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=armebv8-arm-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECKBE + +; This is the same as Thumb2/mve-vmovn.ll, testing the same patterns for neon +; under both LE and BE. The vmovn instruction is very different between +; mve and neon, so these tests are not necessarily expected to generate a (neon) +; vmovn. + +define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_trunc1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vzip.32 q0, q1 +; CHECK-NEXT: vmovn.i32 d17, q1 +; CHECK-NEXT: vmovn.i32 d16, q0 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_trunc1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q1 +; CHECKBE-NEXT: vrev64.32 q9, q0 +; CHECKBE-NEXT: vzip.32 q9, q8 +; CHECKBE-NEXT: vmovn.i32 d17, q8 +; CHECKBE-NEXT: vmovn.i32 d16, q9 +; CHECKBE-NEXT: vrev64.16 q0, q8 +; CHECKBE-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> + %out = trunc <8 x i32> %strided.vec to <8 x i16> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_trunc2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vzip.32 q1, q0 +; CHECK-NEXT: vmovn.i32 d1, q0 +; CHECK-NEXT: vmovn.i32 d0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_trunc2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q0 +; CHECKBE-NEXT: vrev64.32 q9, q1 +; CHECKBE-NEXT: vzip.32 q9, q8 +; CHECKBE-NEXT: vmovn.i32 d17, q8 +; CHECKBE-NEXT: vmovn.i32 d16, q9 +; CHECKBE-NEXT: vrev64.16 q0, q8 +; CHECKBE-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> + %out = trunc <8 x i32> %strided.vec to <8 x i16> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_trunc1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vzip.16 q0, q1 +; CHECK-NEXT: vmovn.i16 d17, q1 +; CHECK-NEXT: vmovn.i16 d16, q0 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_trunc1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q8, q1 +; CHECKBE-NEXT: vrev64.16 q9, q0 +; CHECKBE-NEXT: vzip.16 q9, q8 +; CHECKBE-NEXT: vmovn.i16 d17, q8 +; CHECKBE-NEXT: vmovn.i16 d16, q9 +; CHECKBE-NEXT: vrev64.8 q0, q8 +; CHECKBE-NEXT: bx lr +entry: + %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> + %out = trunc <16 x i16> %strided.vec to <16 x i8> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_trunc2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vzip.16 q1, q0 +;
CHECK-NEXT: vmovn.i16 d1, q0 +; CHECK-NEXT: vmovn.i16 d0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_trunc2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q8, q0 +; CHECKBE-NEXT: vrev64.16 q9, q1 +; CHECKBE-NEXT: vzip.16 q9, q8 +; CHECKBE-NEXT: vmovn.i16 d17, q8 +; CHECKBE-NEXT: vmovn.i16 d16, q9 +; CHECKBE-NEXT: vrev64.8 q0, q8 +; CHECKBE-NEXT: bx lr +entry: + %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> + %out = trunc <16 x i16> %strided.vec to <16 x i8> + ret <16 x i8> %out +} + + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_t1(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f64 d1, d2 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_t2(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr d3, d0, d0 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vorr d3, d0, d0 +; CHECKBE-NEXT: vorr q0, q1, q1 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_b1(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f64 d1, d3 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f64 d1, d3 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f64 d16, d3 +; CHECK-NEXT: vorr d17, d0, d0 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f64 d16, d3 +; CHECKBE-NEXT: vorr d17, d0, d0 +; CHECKBE-NEXT: vorr q0, q8, q8 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f64 d0, d1 +; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f64 d0, d1 +; CHECKBE-NEXT: vmov.f64 d1, d2 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @vmovn64_b4(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: vmovn64_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr d3, d1, d1 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vorr d3, d1, d1 +; CHECKBE-NEXT: vorr q0, q1, q1 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + ret <2 x i64> %out +} + + + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_t1(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.32 q0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_t1: +; 
CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q1 +; CHECKBE-NEXT: vrev64.32 q9, q0 +; CHECKBE-NEXT: vtrn.32 q9, q8 +; CHECKBE-NEXT: vrev64.32 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.32 q1, q0 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q0 +; CHECKBE-NEXT: vrev64.32 q9, q1 +; CHECKBE-NEXT: vtrn.32 q9, q8 +; CHECKBE-NEXT: vrev64.32 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_b1(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q8, q0 +; CHECK-NEXT: vtrn.32 q8, q1 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q0 +; CHECKBE-NEXT: vrev64.32 q9, q1 +; CHECKBE-NEXT: vrev64.32 q8, q8 +; CHECKBE-NEXT: vtrn.32 q8, q9 +; CHECKBE-NEXT: vrev64.32 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vext.32 q8, q0, q0, #1 +; CHECK-NEXT: vtrn.32 q8, q1 +; CHECK-NEXT: vext.32 q0, q1, q1, #1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q8, q0 +; CHECKBE-NEXT: vrev64.32 q9, q1 +; CHECKBE-NEXT: vext.32 q8, q8, q8, #1 +; CHECKBE-NEXT: vtrn.32 q8, q9 +; CHECKBE-NEXT: vext.32 q8, q9, q9, #1 +; CHECKBE-NEXT: vrev64.32 q0, q8 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q8, q0, q0 +; CHECK-NEXT: vtrn.32 q8, q1 +; CHECK-NEXT: vtrn.32 q0, q8 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q9, q0 +; CHECKBE-NEXT: vrev64.32 q8, q1 +; CHECKBE-NEXT: vorr q10, q9, q9 +; CHECKBE-NEXT: vtrn.32 q10, q8 +; CHECKBE-NEXT: vtrn.32 q9, q10 +; CHECKBE-NEXT: vrev64.32 q0, q10 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: vmovn32_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q8, q0, q0 +; CHECK-NEXT: vtrn.32 q8, q1 +; CHECK-NEXT: vtrn.32 q8, q0 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q9, q0 +; CHECKBE-NEXT: vrev64.32 q8, q1 +; CHECKBE-NEXT: vorr q10, q9, q9 +; CHECKBE-NEXT: vtrn.32 q10, q8 +; CHECKBE-NEXT: vtrn.32 q10, q9 +; CHECKBE-NEXT: vrev64.32 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + ret <4 x i32> %out +} + + + + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_t1(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_t1: +; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: vtrn.16 q0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q8, q1 +; CHECKBE-NEXT: vrev64.16 q9, q0 +; CHECKBE-NEXT: vtrn.16 q9, q8 +; CHECKBE-NEXT: vrev64.16 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_t2(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.16 q1, q0 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q8, q0 +; CHECKBE-NEXT: vrev64.16 q9, q1 +; CHECKBE-NEXT: vtrn.16 q9, q8 +; CHECKBE-NEXT: vrev64.16 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_b1(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev32.16 d16, d1 +; CHECK-NEXT: vrev32.16 d17, d0 +; CHECK-NEXT: vtrn.16 d16, d3 +; CHECK-NEXT: vtrn.16 d17, d2 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 d16, d1 +; CHECKBE-NEXT: vrev64.16 d17, d0 +; CHECKBE-NEXT: vrev64.16 d19, d3 +; CHECKBE-NEXT: vrev32.16 d16, d16 +; CHECKBE-NEXT: vrev64.16 d18, d2 +; CHECKBE-NEXT: vrev32.16 d17, d17 +; CHECKBE-NEXT: vtrn.16 d16, d19 +; CHECKBE-NEXT: vtrn.16 d17, d18 +; CHECKBE-NEXT: vrev64.16 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr d17, d3, d3 +; CHECK-NEXT: vtrn.16 d17, d1 +; CHECK-NEXT: vorr d16, d2, d2 +; CHECK-NEXT: vtrn.16 d16, d0 +; CHECK-NEXT: vtrn.16 d3, d17 +; CHECK-NEXT: vtrn.16 d2, d16 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 d17, d3 +; CHECKBE-NEXT: vorr d21, d17, d17 +; CHECKBE-NEXT: vrev64.16 d16, d1 +; CHECKBE-NEXT: vrev64.16 d19, d2 +; CHECKBE-NEXT: vrev64.16 d18, d0 +; CHECKBE-NEXT: vtrn.16 d21, d16 +; CHECKBE-NEXT: vorr d20, d19, d19 +; CHECKBE-NEXT: vtrn.16 d20, d18 +; CHECKBE-NEXT: vtrn.16 d17, d21 +; CHECKBE-NEXT: vtrn.16 d19, d20 +; CHECKBE-NEXT: vrev64.16 q0, q10 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr d17, d1, d1 +; CHECK-NEXT: vtrn.16 d17, d3 +; CHECK-NEXT: vorr d16, d0, d0 +; CHECK-NEXT: vtrn.16 d16, d2 +; CHECK-NEXT: vtrn.16 d1, d17 +; CHECK-NEXT: vtrn.16 d0, d16 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 d17, d1 +; CHECKBE-NEXT: vorr d21, d17, d17 +; CHECKBE-NEXT: vrev64.16 d16, d3 +; CHECKBE-NEXT: vrev64.16 d19, d0 +; CHECKBE-NEXT: vrev64.16 d18, d2 +; CHECKBE-NEXT: vtrn.16 d21, d16 +; CHECKBE-NEXT: vorr d20, d19, d19 +; CHECKBE-NEXT: vtrn.16 d20, d18 +; CHECKBE-NEXT: vtrn.16 d17, d21 +; CHECKBE-NEXT: vtrn.16 d19, d20 +; CHECKBE-NEXT: vrev64.16 q0, q10 +; CHECKBE-NEXT: bx lr +entry: + %out = 
shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @vmovn16_b4(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: vmovn16_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev32.16 d16, d3 +; CHECK-NEXT: vrev32.16 d17, d2 +; CHECK-NEXT: vtrn.16 d16, d1 +; CHECK-NEXT: vtrn.16 d17, d0 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 d16, d3 +; CHECKBE-NEXT: vrev64.16 d17, d2 +; CHECKBE-NEXT: vrev64.16 d19, d1 +; CHECKBE-NEXT: vrev32.16 d16, d16 +; CHECKBE-NEXT: vrev64.16 d18, d0 +; CHECKBE-NEXT: vrev32.16 d17, d17 +; CHECKBE-NEXT: vtrn.16 d16, d19 +; CHECKBE-NEXT: vtrn.16 d17, d18 +; CHECKBE-NEXT: vrev64.16 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + ret <8 x i16> %out +} + + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_b1(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.8 q0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q8, q1 +; CHECKBE-NEXT: vrev64.8 q9, q0 +; CHECKBE-NEXT: vtrn.8 q9, q8 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_b2(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.8 q1, q0 +; CHECK-NEXT: vorr q0, q1, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q8, q0 +; CHECKBE-NEXT: vrev64.8 q9, q1 +; CHECKBE-NEXT: vtrn.8 q9, q8 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_t1(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q2, q0, q0 +; CHECK-NEXT: vldr d16, .LCPI24_0 +; CHECK-NEXT: vorr d6, d3, d3 +; CHECK-NEXT: vtbl.8 d1, {d5, d6}, d16 +; CHECK-NEXT: vorr d5, d2, d2 +; CHECK-NEXT: vtbl.8 d0, {d4, d5}, d16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI24_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 2 @ 0x2 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 15 @ 0xf +; +; CHECKBE-LABEL: vmovn8_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vldr d16, .LCPI24_0 +; CHECKBE-NEXT: vrev64.8 d19, d3 +; CHECKBE-NEXT: vrev64.8 d21, d2 +; CHECKBE-NEXT: vrev64.8 d18, d1 +; CHECKBE-NEXT: vrev64.8 d16, d16 +; CHECKBE-NEXT: vrev64.8 d20, d0 +; CHECKBE-NEXT: vtbl.8 d19, {d18, d19}, d16 +; CHECKBE-NEXT: vtbl.8 d18, {d20, d21}, d16 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 3 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI24_0: +; CHECKBE-NEXT: .byte 0 @ 0x0 +; CHECKBE-NEXT: .byte 9 @ 0x9 +; CHECKBE-NEXT: .byte 2 @ 0x2 +; CHECKBE-NEXT: .byte 11 @ 0xb +; CHECKBE-NEXT: .byte 4 @ 0x4 +; CHECKBE-NEXT: .byte 13 @ 0xd +; CHECKBE-NEXT: .byte 6 @ 0x6 +; CHECKBE-NEXT: .byte 15 @ 0xf +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_t2(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_t2: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $q1 killed $q1 def $d2_d3_d4 +; CHECK-NEXT: vldr d18, .LCPI25_0 +; CHECK-NEXT: vorr d4, d1, d1 +; CHECK-NEXT: vtbl.8 d17, {d3, d4}, d18 +; CHECK-NEXT: vorr d3, d0, d0 +; CHECK-NEXT: vtbl.8 d16, {d2, d3}, d18 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI25_0: +; CHECK-NEXT: .byte 1 @ 0x1 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 10 @ 0xa +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 14 @ 0xe +; +; CHECKBE-LABEL: vmovn8_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vldr d16, .LCPI25_0 +; CHECKBE-NEXT: vrev64.8 d19, d1 +; CHECKBE-NEXT: vrev64.8 d21, d0 +; CHECKBE-NEXT: vrev64.8 d18, d3 +; CHECKBE-NEXT: vrev64.8 d16, d16 +; CHECKBE-NEXT: vrev64.8 d20, d2 +; CHECKBE-NEXT: vtbl.8 d19, {d18, d19}, d16 +; CHECKBE-NEXT: vtbl.8 d18, {d20, d21}, d16 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 3 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI25_0: +; CHECKBE-NEXT: .byte 1 @ 0x1 +; CHECKBE-NEXT: .byte 8 @ 0x8 +; CHECKBE-NEXT: .byte 3 @ 0x3 +; CHECKBE-NEXT: .byte 10 @ 0xa +; CHECKBE-NEXT: .byte 5 @ 0x5 +; CHECKBE-NEXT: .byte 12 @ 0xc +; CHECKBE-NEXT: .byte 7 @ 0x7 +; CHECKBE-NEXT: .byte 14 @ 0xe +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_t3(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_t3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q2, q0, q0 +; CHECK-NEXT: vldr d16, .LCPI26_0 +; CHECK-NEXT: vorr d6, d3, d3 +; CHECK-NEXT: vtbl.8 d1, {d5, d6}, d16 +; CHECK-NEXT: vorr d5, d2, d2 +; CHECK-NEXT: vtbl.8 d0, {d4, d5}, d16 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI26_0: +; CHECK-NEXT: .byte 1 @ 0x1 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 10 @ 0xa +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 14 @ 0xe +; +; CHECKBE-LABEL: vmovn8_t3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vldr d16, .LCPI26_0 +; CHECKBE-NEXT: vrev64.8 d19, d3 +; CHECKBE-NEXT: vrev64.8 d21, d2 +; CHECKBE-NEXT: vrev64.8 d18, d1 +; CHECKBE-NEXT: vrev64.8 d16, d16 +; CHECKBE-NEXT: vrev64.8 d20, d0 +; CHECKBE-NEXT: vtbl.8 d19, {d18, d19}, d16 +; CHECKBE-NEXT: vtbl.8 d18, {d20, d21}, d16 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 3 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI26_0: +; CHECKBE-NEXT: .byte 1 @ 0x1 +; CHECKBE-NEXT: .byte 8 @ 0x8 +; CHECKBE-NEXT: .byte 3 @ 0x3 +; CHECKBE-NEXT: .byte 10 @ 0xa +; CHECKBE-NEXT: .byte 5 @ 0x5 +; CHECKBE-NEXT: .byte 12 @ 0xc +; CHECKBE-NEXT: .byte 7 @ 0x7 +; CHECKBE-NEXT: .byte 14 @ 0xe +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @vmovn8_t4(<16 x i8> %src1, <16 x i8> %src2) { +; CHECK-LABEL: vmovn8_t4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $q1 killed $q1 def $d2_d3_d4 +; CHECK-NEXT: vldr d18, .LCPI27_0 +; CHECK-NEXT: vorr d4, d1, d1 +; CHECK-NEXT: vtbl.8 d17, {d3, d4}, d18 +; CHECK-NEXT: vorr d3, d0, d0 +; CHECK-NEXT: vtbl.8 d16, {d2, d3}, d18 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI27_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 9 @ 
0x9 +; CHECK-NEXT: .byte 2 @ 0x2 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 15 @ 0xf +; +; CHECKBE-LABEL: vmovn8_t4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vldr d16, .LCPI27_0 +; CHECKBE-NEXT: vrev64.8 d19, d1 +; CHECKBE-NEXT: vrev64.8 d21, d0 +; CHECKBE-NEXT: vrev64.8 d18, d3 +; CHECKBE-NEXT: vrev64.8 d16, d16 +; CHECKBE-NEXT: vrev64.8 d20, d2 +; CHECKBE-NEXT: vtbl.8 d19, {d18, d19}, d16 +; CHECKBE-NEXT: vtbl.8 d18, {d20, d21}, d16 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 3 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI27_0: +; CHECKBE-NEXT: .byte 0 @ 0x0 +; CHECKBE-NEXT: .byte 9 @ 0x9 +; CHECKBE-NEXT: .byte 2 @ 0x2 +; CHECKBE-NEXT: .byte 11 @ 0xb +; CHECKBE-NEXT: .byte 4 @ 0x4 +; CHECKBE-NEXT: .byte 13 @ 0xd +; CHECKBE-NEXT: .byte 6 @ 0x6 +; CHECKBE-NEXT: .byte 15 @ 0xf +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vtrn.8 q0, q1 +; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: test: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q8, q1 +; CHECKBE-NEXT: vrev64.8 q9, q0 +; CHECKBE-NEXT: vtrn.8 q9, q8 +; CHECKBE-NEXT: vrev64.8 q0, q9 +; CHECKBE-NEXT: bx lr +entry: + %a0 = extractelement <8 x i16> %src1, i32 0 + %a1 = extractelement <8 x i16> %src1, i32 1 + %a2 = extractelement <8 x i16> %src1, i32 2 + %a3 = extractelement <8 x i16> %src1, i32 3 + %a4 = extractelement <8 x i16> %src1, i32 4 + %a5 = extractelement <8 x i16> %src1, i32 5 + %a6 = extractelement <8 x i16> %src1, i32 6 + %a7 = extractelement <8 x i16> %src1, i32 7 + + %b0 = extractelement <8 x i16> %src2, i32 0 + %b1 = extractelement <8 x i16> %src2, i32 1 + %b2 = extractelement <8 x i16> %src2, i32 2 + %b3 = extractelement <8 x i16> %src2, i32 3 + %b4 = extractelement <8 x i16> %src2, i32 4 + %b5 = extractelement <8 x i16> %src2, i32 5 + %b6 = extractelement <8 x i16> %src2, i32 6 + %b7 = extractelement <8 x i16> %src2, i32 7 + + %s0 = trunc i16 %a0 to i8 + %s1 = trunc i16 %a1 to i8 + %s2 = trunc i16 %a2 to i8 + %s3 = trunc i16 %a3 to i8 + %s4 = trunc i16 %a4 to i8 + %s5 = trunc i16 %a5 to i8 + %s6 = trunc i16 %a6 to i8 + %s7 = trunc i16 %a7 to i8 + %t0 = trunc i16 %b0 to i8 + %t1 = trunc i16 %b1 to i8 + %t2 = trunc i16 %b2 to i8 + %t3 = trunc i16 %b3 to i8 + %t4 = trunc i16 %b4 to i8 + %t5 = trunc i16 %b5 to i8 + %t6 = trunc i16 %b6 to i8 + %t7 = trunc i16 %b7 to i8 + + %r0 = insertelement <16 x i8> undef, i8 %s0, i32 0 + %r1 = insertelement <16 x i8> %r0, i8 %s1, i32 2 + %r2 = insertelement <16 x i8> %r1, i8 %s2, i32 4 + %r3 = insertelement <16 x i8> %r2, i8 %s3, i32 6 + %r4 = insertelement <16 x i8> %r3, i8 %s4, i32 8 + %r5 = insertelement <16 x i8> %r4, i8 %s5, i32 10 + %r6 = insertelement <16 x i8> %r5, i8 %s6, i32 12 + %r7 = insertelement <16 x i8> %r6, i8 %s7, i32 14 + %r10 = insertelement <16 x i8> %r7, i8 %t0, i32 1 + %r11 = insertelement <16 x i8> %r10, i8 %t1, i32 3 + %r12 = insertelement <16 x i8> %r11, i8 %t2, i32 5 + %r13 = insertelement <16 x i8> %r12, i8 %t3, i32 7 + %r14 = insertelement <16 x i8> %r13, i8 %t4, i32 9 + %r15 = insertelement <16 x i8> %r14, i8 %t5, i32 11 + %r16 = insertelement <16 x i8> %r15, i8 %t6, i32 13 + %r17 = insertelement <16 x i8> %r16, i8 %t7, i32 15 + + ret <16 x i8> %r17 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll 
b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll index 20f790d..ec34bdd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -1,11 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECKBE define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_trunc1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnt.i32 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_trunc1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: vmovnt.i32 q1, q2 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr entry: %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> %out = trunc <8 x i32> %strided.vec to <8 x i16> @@ -18,6 +27,14 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src ; CHECK-NEXT: vmovnt.i32 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_trunc2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vrev64.16 q3, q1 +; CHECKBE-NEXT: vmovnt.i32 q3, q2 +; CHECKBE-NEXT: vrev64.16 q0, q3 +; CHECKBE-NEXT: bx lr entry: %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> %out = trunc <8 x i32> %strided.vec to <8 x i16> @@ -29,6 +46,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnt.i16 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_trunc1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: vmovnt.i16 q1, q2 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> %out = trunc <16 x i16> %strided.vec to <16 x i8> @@ -41,6 +66,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src ; CHECK-NEXT: vmovnt.i16 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_trunc2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: vrev64.8 q3, q1 +; CHECKBE-NEXT: vmovnt.i16 q3, q2 +; CHECKBE-NEXT: vrev64.8 q0, q3 +; CHECKBE-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> %out = trunc <16 x i16> %strided.vec to <16 x i8> @@ -54,6 +87,12 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_t1(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s2, s4 +; CHECKBE-NEXT: vmov.f32 s3, s5 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -66,6 +105,13 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_t2(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s6, s0 +; CHECKBE-NEXT: vmov.f32 s7, s1 +; CHECKBE-NEXT: vmov q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -77,6 +123,12 @@ define 
arm_aapcs_vfpcc <2 x i64> @vmovn64_b1(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s2, s6 +; CHECKBE-NEXT: vmov.f32 s3, s7 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -91,6 +143,15 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s4, s6 +; CHECKBE-NEXT: vmov.f32 s5, s7 +; CHECKBE-NEXT: vmov.f32 s6, s0 +; CHECKBE-NEXT: vmov.f32 s7, s1 +; CHECKBE-NEXT: vmov q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -104,6 +165,14 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s0, s2 +; CHECKBE-NEXT: vmov.f32 s1, s3 +; CHECKBE-NEXT: vmov.f32 s2, s4 +; CHECKBE-NEXT: vmov.f32 s3, s5 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -116,6 +185,13 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b4(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn64_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 s6, s2 +; CHECKBE-NEXT: vmov.f32 s7, s3 +; CHECKBE-NEXT: vmov q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> ret <2 x i64> %out @@ -129,6 +205,15 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_t1(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: vmov.f32 s5, s8 +; CHECKBE-NEXT: vmov.f32 s7, s10 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -141,6 +226,15 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vrev64.32 q3, q1 +; CHECKBE-NEXT: vmov.f32 s13, s8 +; CHECKBE-NEXT: vmov.f32 s15, s10 +; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -152,6 +246,15 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b1(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vmov.f32 s1, s5 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: vmov.f32 s5, s9 +; CHECKBE-NEXT: vmov.f32 s7, s11 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -166,6 +269,17 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: 
vmov.f32 s11, s2 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: vmov.f32 s4, s1 +; CHECKBE-NEXT: vmov.f32 s5, s8 +; CHECKBE-NEXT: vmov.f32 s6, s3 +; CHECKBE-NEXT: vmov.f32 s7, s10 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -180,6 +294,17 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vmov.f32 s11, s6 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: vmov.f32 s12, s5 +; CHECKBE-NEXT: vmov.f32 s13, s8 +; CHECKBE-NEXT: vmov.f32 s14, s7 +; CHECKBE-NEXT: vmov.f32 s15, s10 +; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -192,6 +317,15 @@ define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vmov.f32 s7, s3 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn32_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vrev64.32 q3, q1 +; CHECKBE-NEXT: vmov.f32 s13, s9 +; CHECKBE-NEXT: vmov.f32 s15, s11 +; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> ret <4 x i32> %out @@ -205,6 +339,14 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_t1(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnt.i32 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: vmovnt.i32 q1, q2 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -216,6 +358,14 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_t2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmovnt.i32 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vrev64.16 q3, q1 +; CHECKBE-NEXT: vmovnt.i32 q3, q2 +; CHECKBE-NEXT: vrev64.16 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -227,6 +377,14 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b1(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmovnb.i32 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vrev64.16 q3, q1 +; CHECKBE-NEXT: vmovnb.i32 q3, q2 +; CHECKBE-NEXT: vrev64.16 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -253,6 +411,29 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmov.u16 r0, q2[6] ; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q3, q0 +; CHECKBE-NEXT: vmov.u16 r0, q2[1] +; CHECKBE-NEXT: vmov.16 q1[0], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[0] +; CHECKBE-NEXT: vmov.16 q1[1], r0 +; 
CHECKBE-NEXT: vmov.u16 r0, q2[3] +; CHECKBE-NEXT: vmov.16 q1[2], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[2] +; CHECKBE-NEXT: vmov.16 q1[3], r0 +; CHECKBE-NEXT: vmov.u16 r0, q2[5] +; CHECKBE-NEXT: vmov.16 q1[4], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[4] +; CHECKBE-NEXT: vmov.16 q1[5], r0 +; CHECKBE-NEXT: vmov.u16 r0, q2[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[6] +; CHECKBE-NEXT: vmov.16 q1[7], r0 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -279,6 +460,29 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q3, q0 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: vmov.u16 r0, q3[1] +; CHECKBE-NEXT: vmov.16 q2[0], r0 +; CHECKBE-NEXT: vmov.u16 r0, q0[0] +; CHECKBE-NEXT: vmov.16 q2[1], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[3] +; CHECKBE-NEXT: vmov.16 q2[2], r0 +; CHECKBE-NEXT: vmov.u16 r0, q0[2] +; CHECKBE-NEXT: vmov.16 q2[3], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[5] +; CHECKBE-NEXT: vmov.16 q2[4], r0 +; CHECKBE-NEXT: vmov.u16 r0, q0[4] +; CHECKBE-NEXT: vmov.16 q2[5], r0 +; CHECKBE-NEXT: vmov.u16 r0, q3[7] +; CHECKBE-NEXT: vmov.16 q2[6], r0 +; CHECKBE-NEXT: vmov.u16 r0, q0[6] +; CHECKBE-NEXT: vmov.16 q2[7], r0 +; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -289,6 +493,14 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b4(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnb.i32 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn16_b4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: vmovnb.i32 q1, q2 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> ret <8 x i16> %out @@ -300,6 +512,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_b1(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnt.i16 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_b1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: vmovnt.i16 q1, q2 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> ret <16 x i8> %out @@ -311,6 +531,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_b2(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-NEXT: vmovnt.i16 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_b2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: vrev64.8 q3, q1 +; CHECKBE-NEXT: vmovnt.i16 q3, q2 +; CHECKBE-NEXT: vrev64.8 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> ret <16 x i8> %out @@ -322,6 +550,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_t1(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-NEXT: vmovnb.i16 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_t1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: vrev64.8 q3, q1 +; CHECKBE-NEXT: vmovnb.i16 q3, q2 +; CHECKBE-NEXT: vrev64.8 q0, q3 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> 
ret <16 x i8> %out @@ -364,6 +600,45 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_t2(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q0[15], r0 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_t2: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q3, q0 +; CHECKBE-NEXT: vmov.u8 r0, q2[1] +; CHECKBE-NEXT: vmov.8 q1[0], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[0] +; CHECKBE-NEXT: vmov.8 q1[1], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[3] +; CHECKBE-NEXT: vmov.8 q1[2], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[2] +; CHECKBE-NEXT: vmov.8 q1[3], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[5] +; CHECKBE-NEXT: vmov.8 q1[4], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[4] +; CHECKBE-NEXT: vmov.8 q1[5], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[7] +; CHECKBE-NEXT: vmov.8 q1[6], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[6] +; CHECKBE-NEXT: vmov.8 q1[7], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[9] +; CHECKBE-NEXT: vmov.8 q1[8], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[8] +; CHECKBE-NEXT: vmov.8 q1[9], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[11] +; CHECKBE-NEXT: vmov.8 q1[10], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[10] +; CHECKBE-NEXT: vmov.8 q1[11], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[13] +; CHECKBE-NEXT: vmov.8 q1[12], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[12] +; CHECKBE-NEXT: vmov.8 q1[13], r0 +; CHECKBE-NEXT: vmov.u8 r0, q2[15] +; CHECKBE-NEXT: vmov.8 q1[14], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[14] +; CHECKBE-NEXT: vmov.8 q1[15], r0 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> ret <16 x i8> %out @@ -406,6 +681,45 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_t3(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.8 q0[15], r0 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_t3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q3, q0 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: vmov.u8 r0, q3[1] +; CHECKBE-NEXT: vmov.8 q2[0], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[0] +; CHECKBE-NEXT: vmov.8 q2[1], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[3] +; CHECKBE-NEXT: vmov.8 q2[2], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[2] +; CHECKBE-NEXT: vmov.8 q2[3], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[5] +; CHECKBE-NEXT: vmov.8 q2[4], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[4] +; CHECKBE-NEXT: vmov.8 q2[5], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[7] +; CHECKBE-NEXT: vmov.8 q2[6], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[6] +; CHECKBE-NEXT: vmov.8 q2[7], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[9] +; CHECKBE-NEXT: vmov.8 q2[8], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[8] +; CHECKBE-NEXT: vmov.8 q2[9], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[11] +; CHECKBE-NEXT: vmov.8 q2[10], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[10] +; CHECKBE-NEXT: vmov.8 q2[11], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[13] +; CHECKBE-NEXT: vmov.8 q2[12], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[12] +; CHECKBE-NEXT: vmov.8 q2[13], r0 +; CHECKBE-NEXT: vmov.u8 r0, q3[15] +; CHECKBE-NEXT: vmov.8 q2[14], r0 +; CHECKBE-NEXT: vmov.u8 r0, q0[14] +; CHECKBE-NEXT: vmov.8 q2[15], r0 +; CHECKBE-NEXT: vrev64.8 q0, q2 +; CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> ret <16 x i8> %out @@ -416,6 +730,14 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn8_t4(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovnb.i16 q0, q1 ; CHECK-NEXT: bx lr +; +; CHECKBE-LABEL: vmovn8_t4: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: vmovnb.i16 q1, q2 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; 
CHECKBE-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> ret <16 x i8> %out -- 2.7.4
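
Note (illustrative, not part of the patch): the *_trunc tests above all exercise the same interleave-then-truncate IR pattern, which LE MVE selects to a single vmovnt/vmovnb while BE needs the extra vrev64 instructions to convert between memory and register lane orders. Below is a minimal sketch of that pattern, assuming even result lanes come from %src1 and odd lanes from %src2; the function name and shuffle mask here are illustrative assumptions, not copied from the patch.

  define arm_aapcs_vfpcc <8 x i16> @vmovnt_sketch(<4 x i32> %src1, <4 x i32> %src2) {
  entry:
    ; Interleave the two sources lane by lane: result lanes 0,2,4,6 come from
    ; %src1 and lanes 1,3,5,7 come from %src2 (an assumed mask for illustration).
    %interleaved = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
    ; Narrow each i32 lane to i16; combined with the interleave, this is the
    ; shape that matches the LE CHECK line "vmovnt.i32 q0, q1" in vmovn32_trunc1.
    %narrowed = trunc <8 x i32> %interleaved to <8 x i16>
    ret <8 x i16> %narrowed
  }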