From 8d82f12ac3e8a6dae4e50d20da0c14fc30bfc7ee Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 31 May 2023 18:42:01 +0100 Subject: [PATCH] [ARM][AArch64] Add tests for shuffles load patterns. NFC See D151029 --- llvm/test/CodeGen/AArch64/insertshuffleload.ll | 478 +++++++++++++++++++++ llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll | 482 ++++++++++++++++++++++ 2 files changed, 960 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/insertshuffleload.ll create mode 100644 llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll new file mode 100644 index 0000000..c9bdb95 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll @@ -0,0 +1,478 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s + +define <8 x i8> @inserti8_first(ptr %p) { +; CHECK-LABEL: inserti8_first: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @inserti8_last(ptr %p) { +; CHECK-LABEL: inserti8_last: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1 +; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %l2 = load i8, ptr %q + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 + ret <8 x i8> %ins +} + +define <8 x i16> @inserti8_first_sext(ptr %p) { +; CHECK-LABEL: inserti8_first_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrsb w8, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = sext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %p + %s2 = sext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_last_sext(ptr %p) { +; CHECK-LABEL: inserti8_last_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrsb w8, [x0, #8] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %s1 = sext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %q + %s2 = sext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_first_zext(ptr %p) { +; CHECK-LABEL: inserti8_first_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = zext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %p + %s2 = zext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_last_zext(ptr %p) { +; CHECK-LABEL: inserti8_last_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrb w8, [x0, #8] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %s1 = zext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %q + %s2 = zext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 + ret <8 x i16> %ins +} + +define <8 x i32> @inserti32_first(ptr %p) { +; CHECK-LABEL: inserti32_first: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur q1, [x0, #4] +; CHECK-NEXT: ldur q2, [x0, #20] +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; CHECK-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; CHECK-NEXT: ld1 { v0.s }[0], [x0] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <8 x i32>, ptr %q + %l2 = load i32, ptr %p + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 + ret <8 x i32> %ins +} + +define <8 x i32> @inserti32_last(ptr %p) { +; CHECK-LABEL: inserti32_last: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: add x8, x0, #32 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: ext v0.16b, v2.16b, v0.16b, #4 +; CHECK-NEXT: ld1 { v1.s }[3], [x8] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 32 + %l1 = load <8 x i32>, ptr %p + %l2 = load i32, ptr %q + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 + ret <8 x i32> %ins +} + +define <8 x i32> @inserti32_first_multiuse(ptr %p) { +; CHECK-LABEL: inserti32_first_multiuse: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur q0, [x0, #4] +; CHECK-NEXT: ldur q1, [x0, #20] +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #12 +; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #12 +; CHECK-NEXT: ld1 { v2.s }[0], [x0] +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <8 x i32>, ptr %q + %l2 = load i32, ptr %p + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 + %a = add <8 x i32> %l1, %ins + ret <8 x i32> %a +} + +define <8 x i32> @inserti32_last_multiuse(ptr %p) { +; CHECK-LABEL: inserti32_last_multiuse: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: add x8, x0, #32 +; CHECK-NEXT: ext v2.16b, v1.16b, v0.16b, #4 +; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: ld1 { v2.s }[3], [x8] +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 32 + %l1 = load <8 x i32>, ptr %p + %l2 = load i32, ptr %q + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 + %a = add <8 x i32> %l1, %ins + ret <8 x i32> %a +} + +define <4 x float> @insertf32_first(ptr %p) { +; CHECK-LABEL: insertf32_first: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur q0, [x0, #4] +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12 +; CHECK-NEXT: ld1 { v0.s }[0], [x0] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <4 x float>, ptr %q + %l2 = load float, ptr %p + %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> + %ins = insertelement <4 x float> %s, float %l2, i32 0 + ret <4 x float> %ins +} + +define <4 x float> @insertf32_last(ptr %p) { +; CHECK-LABEL: insertf32_last: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: add x8, x0, #16 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: ld1 { v0.s }[3], [x8] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 16 + %l1 = load <4 x float>, ptr %p + %l2 = load float, ptr %q + %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> + %ins = insertelement <4 x float> %s, float %l2, i32 3 + ret <4 x float> %ins +} + +define <2 x i64> @inserti64_first(ptr %p) { +; CHECK-LABEL: inserti64_first: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: ld1r { v0.2d }, [x8] +; CHECK-NEXT: ld1 { v0.d }[0], [x0] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <2 x i64>, ptr %q + %l2 = load i64, ptr %p + %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> + %ins = insertelement <2 x i64> %s, i64 %l2, i32 0 + ret <2 x i64> %ins +} + +define <2 x i64> @inserti64_last(ptr %p) { +; CHECK-LABEL: inserti64_last: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: add x8, x0, #16 +; CHECK-NEXT: dup v0.2d, v0.d[1] +; CHECK-NEXT: ld1 { v0.d }[1], [x8] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 16 + %l1 = load <2 x i64>, ptr %p + %l2 = load i64, ptr %q + %s = shufflevector <2 x i64> %l1, <2 x i64> undef, <2 x i32> + %ins = insertelement <2 x i64> %s, i64 %l2, i32 1 + ret <2 x i64> %ins +} + +define <8 x i8> @inserti8_first_undef(ptr %p) { +; CHECK-LABEL: inserti8_first_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @inserti8_last_undef(ptr %p) { +; CHECK-LABEL: inserti8_last_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: dup v0.8b, v0.b[1] +; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %l2 = load i8, ptr %q + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 + ret <8 x i8> %ins +} + + + +define <8 x i16> @wrong_zextandsext(ptr %p) { +; CHECK-LABEL: wrong_zextandsext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrsb w8, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = zext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %p + %s2 = sext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 + ret <8 x i16> %ins +} + +define <8 x i8> @wrongidx_first(ptr %p) { +; CHECK-LABEL: wrongidx_first: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[7], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 + ret <8 x i8> %ins +} + +define <8 x i8> @wrong_last(ptr %p) { +; CHECK-LABEL: wrong_last: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1 +; CHECK-NEXT: ld1 { v0.b }[0], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %l2 = load i8, ptr %q + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @wrong_shuffle(ptr %p) { +; CHECK-LABEL: wrong_shuffle: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: mov v0.d[1], v0.d[0] +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i16> @wrong_exttype(ptr %p) { +; CHECK-LABEL: wrong_exttype: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = sext <8 x i8> %l1 to <8 x i16> + %l2 = load i16, ptr %p + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %l2, i32 0 + ret <8 x i16> %ins +} + +define <4 x i32> @wrong_exttype2(ptr %p) { +; CHECK-LABEL: wrong_exttype2: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12 +; CHECK-NEXT: mov v0.s[0], w8 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <4 x i8>, ptr %q + %s1 = sext <4 x i8> %l1 to <4 x i32> + %l2 = load i16, ptr %p + %s2 = sext i16 %l2 to i32 + %s = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> + %ins = insertelement <4 x i32> %s, i32 %s2, i32 0 + ret <4 x i32> %ins +} + +define <8 x i8> @wrong_offsetfirst(ptr %p) { +; CHECK-LABEL: wrong_offsetfirst: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #-1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 -1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @wrong_offsetlast(ptr %p) { +; CHECK-LABEL: wrong_offsetlast: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x8, x0, #7 +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1 +; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 7 + %l1 = load <8 x i8>, ptr %p + %l2 = load i8, ptr %q + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 + ret <8 x i8> %ins +} + + +define <8 x i8> @storebetween(ptr %p, ptr %r) { +; CHECK-LABEL: storebetween: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: strb wzr, [x1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + store i8 0, ptr %r + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @storebefore(ptr %p, ptr %r) { +; CHECK-LABEL: storebefore: +; CHECK: // %bb.0: +; CHECK-NEXT: strb wzr, [x1] +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + store i8 0, ptr %r + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @storeafter(ptr %p, ptr %r) { +; CHECK-LABEL: storeafter: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: strb wzr, [x1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + store i8 0, ptr %r + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll new file mode 100644 index 0000000..7714f8d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=thumbv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKLE +; RUN: llc < %s -mtriple=thumbebv8.1m.main-none-eabihf -mattr=+mve.fp | FileCheck %s --check-prefix=CHECKBE + + +define <8 x i8> @inserti8_first(ptr %p) { +; CHECKLE-LABEL: inserti8_first: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1] +; CHECKLE-NEXT: ldrb r1, [r0] +; CHECKLE-NEXT: vmovx.f16 s10, s5 +; CHECKLE-NEXT: vmovx.f16 s8, s4 +; CHECKLE-NEXT: vins.f16 s10, s6 +; CHECKLE-NEXT: vmovx.f16 s6, s6 +; CHECKLE-NEXT: vmov.16 q0[0], r1 +; CHECKLE-NEXT: vins.f16 s8, s5 +; CHECKLE-NEXT: vins.f16 s6, s7 +; CHECKLE-NEXT: vmov.f32 s1, s8 +; CHECKLE-NEXT: vmov.f32 s2, s10 +; CHECKLE-NEXT: vins.f16 s0, s4 +; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_first: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1] +; CHECKBE-NEXT: ldrb r1, [r0] +; CHECKBE-NEXT: vmovx.f16 s6, s1 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vins.f16 s6, s2 +; CHECKBE-NEXT: vmovx.f16 s2, s2 +; CHECKBE-NEXT: vmov.16 q2[0], r1 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s2, s3 +; CHECKBE-NEXT: vins.f16 s8, s0 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s6 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %l2 = load i8, ptr %p + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 0 + ret <8 x i8> %ins +} + +define <8 x i8> @inserti8_last(ptr %p) { +; CHECKLE-LABEL: inserti8_last: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.u16 q1, [r0] +; CHECKLE-NEXT: ldrb r1, [r0, #8] +; CHECKLE-NEXT: vmovx.f16 s0, s4 +; CHECKLE-NEXT: vmovx.f16 s1, s5 +; CHECKLE-NEXT: vmovx.f16 s2, s6 +; CHECKLE-NEXT: vins.f16 s0, s5 +; CHECKLE-NEXT: vins.f16 s1, s6 +; CHECKLE-NEXT: vins.f16 s2, s7 +; CHECKLE-NEXT: vmov.u16 r0, q1[7] +; CHECKLE-NEXT: vmov.16 q0[6], r0 +; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_last: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.u16 q0, [r0] +; CHECKBE-NEXT: ldrb r1, [r0, #8] +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s5, s2 +; CHECKBE-NEXT: vins.f16 s6, s3 +; CHECKBE-NEXT: vmov.u16 r0, q0[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %l2 = load i8, ptr %q + %s = shufflevector <8 x i8> %l1, <8 x i8> undef, <8 x i32> + %ins = insertelement <8 x i8> %s, i8 %l2, i32 7 + ret <8 x i8> %ins +} + +define <8 x i16> @inserti8_first_sext(ptr %p) { +; CHECKLE-LABEL: inserti8_first_sext: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.s16 q1, [r0, #1] +; CHECKLE-NEXT: ldrsb.w r1, [r0] +; CHECKLE-NEXT: vmovx.f16 s10, s5 +; CHECKLE-NEXT: vmovx.f16 s8, s4 +; CHECKLE-NEXT: vins.f16 s10, s6 +; CHECKLE-NEXT: vmovx.f16 s6, s6 +; CHECKLE-NEXT: vmov.16 q0[0], r1 +; CHECKLE-NEXT: vins.f16 s8, s5 +; CHECKLE-NEXT: vins.f16 s6, s7 +; CHECKLE-NEXT: vmov.f32 s1, s8 +; CHECKLE-NEXT: vmov.f32 s2, s10 +; CHECKLE-NEXT: vins.f16 s0, s4 +; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_first_sext: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.s16 q0, [r0, #1] +; CHECKBE-NEXT: ldrsb.w r1, [r0] +; CHECKBE-NEXT: vmovx.f16 s6, s1 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vins.f16 s6, s2 +; CHECKBE-NEXT: vmovx.f16 s2, s2 +; CHECKBE-NEXT: vmov.16 q2[0], r1 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s2, s3 +; CHECKBE-NEXT: vins.f16 s8, s0 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s6 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = sext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %p + %s2 = sext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_last_sext(ptr %p) { +; CHECKLE-LABEL: inserti8_last_sext: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.s16 q1, [r0] +; CHECKLE-NEXT: ldrsb.w r1, [r0, #8] +; CHECKLE-NEXT: vmovx.f16 s0, s4 +; CHECKLE-NEXT: vmovx.f16 s1, s5 +; CHECKLE-NEXT: vmovx.f16 s2, s6 +; CHECKLE-NEXT: vins.f16 s0, s5 +; CHECKLE-NEXT: vins.f16 s1, s6 +; CHECKLE-NEXT: vins.f16 s2, s7 +; CHECKLE-NEXT: vmov.u16 r0, q1[7] +; CHECKLE-NEXT: vmov.16 q0[6], r0 +; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_last_sext: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.s16 q0, [r0] +; CHECKBE-NEXT: ldrsb.w r1, [r0, #8] +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s5, s2 +; CHECKBE-NEXT: vins.f16 s6, s3 +; CHECKBE-NEXT: vmov.u16 r0, q0[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %s1 = sext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %q + %s2 = sext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_first_zext(ptr %p) { +; CHECKLE-LABEL: inserti8_first_zext: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1] +; CHECKLE-NEXT: ldrb r1, [r0] +; CHECKLE-NEXT: vmovx.f16 s10, s5 +; CHECKLE-NEXT: vmovx.f16 s8, s4 +; CHECKLE-NEXT: vins.f16 s10, s6 +; CHECKLE-NEXT: vmovx.f16 s6, s6 +; CHECKLE-NEXT: vmov.16 q0[0], r1 +; CHECKLE-NEXT: vins.f16 s8, s5 +; CHECKLE-NEXT: vins.f16 s6, s7 +; CHECKLE-NEXT: vmov.f32 s1, s8 +; CHECKLE-NEXT: vmov.f32 s2, s10 +; CHECKLE-NEXT: vins.f16 s0, s4 +; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_first_zext: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1] +; CHECKBE-NEXT: ldrb r1, [r0] +; CHECKBE-NEXT: vmovx.f16 s6, s1 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vins.f16 s6, s2 +; CHECKBE-NEXT: vmovx.f16 s2, s2 +; CHECKBE-NEXT: vmov.16 q2[0], r1 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s2, s3 +; CHECKBE-NEXT: vins.f16 s8, s0 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s6 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 1 + %l1 = load <8 x i8>, ptr %q + %s1 = zext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %p + %s2 = zext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 0 + ret <8 x i16> %ins +} + +define <8 x i16> @inserti8_last_zext(ptr %p) { +; CHECKLE-LABEL: inserti8_last_zext: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrb.u16 q1, [r0] +; CHECKLE-NEXT: ldrb r1, [r0, #8] +; CHECKLE-NEXT: vmovx.f16 s0, s4 +; CHECKLE-NEXT: vmovx.f16 s1, s5 +; CHECKLE-NEXT: vmovx.f16 s2, s6 +; CHECKLE-NEXT: vins.f16 s0, s5 +; CHECKLE-NEXT: vins.f16 s1, s6 +; CHECKLE-NEXT: vins.f16 s2, s7 +; CHECKLE-NEXT: vmov.u16 r0, q1[7] +; CHECKLE-NEXT: vmov.16 q0[6], r0 +; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti8_last_zext: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrb.u16 q0, [r0] +; CHECKBE-NEXT: ldrb r1, [r0, #8] +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s5, s2 +; CHECKBE-NEXT: vins.f16 s6, s3 +; CHECKBE-NEXT: vmov.u16 r0, q0[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 8 + %l1 = load <8 x i8>, ptr %p + %s1 = zext <8 x i8> %l1 to <8 x i16> + %l2 = load i8, ptr %q + %s2 = zext i8 %l2 to i16 + %s = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> + %ins = insertelement <8 x i16> %s, i16 %s2, i32 7 + ret <8 x i16> %ins +} + +define <8 x i32> @inserti32_first(ptr %p) { +; CHECKLE-LABEL: inserti32_first: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4] +; CHECKLE-NEXT: vldrw.u32 q2, [r0, #20] +; CHECKLE-NEXT: ldr r1, [r0] +; CHECKLE-NEXT: vmov.f32 s1, s4 +; CHECKLE-NEXT: vmov.f32 s2, s5 +; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: vmov.f32 s4, s7 +; CHECKLE-NEXT: vmov.32 q0[0], r1 +; CHECKLE-NEXT: vmov.f32 s5, s8 +; CHECKLE-NEXT: vmov.f32 s6, s9 +; CHECKLE-NEXT: vmov.f32 s7, s10 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti32_first: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q0, [r0, #20] +; CHECKBE-NEXT: vldrw.u32 q2, [r0, #4] +; CHECKBE-NEXT: ldr r1, [r0] +; CHECKBE-NEXT: vmov.f32 s12, s11 +; CHECKBE-NEXT: vmov.f32 s13, s0 +; CHECKBE-NEXT: vmov.f32 s14, s1 +; CHECKBE-NEXT: vmov.f32 s15, s2 +; CHECKBE-NEXT: vrev64.32 q1, q3 +; CHECKBE-NEXT: vmov.f32 s13, s8 +; CHECKBE-NEXT: vmov.f32 s14, s9 +; CHECKBE-NEXT: vmov.f32 s15, s10 +; CHECKBE-NEXT: vmov.32 q3[0], r1 +; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <8 x i32>, ptr %q + %l2 = load i32, ptr %p + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 + ret <8 x i32> %ins +} + +define <8 x i32> @inserti32_last(ptr %p) { +; CHECKLE-LABEL: inserti32_last: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q2, [r0, #16] +; CHECKLE-NEXT: vldrw.u32 q0, [r0] +; CHECKLE-NEXT: ldr r1, [r0, #32] +; CHECKLE-NEXT: vmov.f32 s0, s1 +; CHECKLE-NEXT: vmov.f32 s1, s2 +; CHECKLE-NEXT: vmov.f32 s2, s3 +; CHECKLE-NEXT: vmov.f32 s3, s8 +; CHECKLE-NEXT: vmov.f32 s4, s9 +; CHECKLE-NEXT: vmov.f32 s5, s10 +; CHECKLE-NEXT: vmov.f32 s6, s11 +; CHECKLE-NEXT: vmov.32 q1[3], r1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti32_last: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q0, [r0] +; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16] +; CHECKBE-NEXT: ldr r1, [r0, #32] +; CHECKBE-NEXT: vmov.f32 s8, s1 +; CHECKBE-NEXT: vmov.f32 s9, s2 +; CHECKBE-NEXT: vmov.f32 s10, s3 +; CHECKBE-NEXT: vmov.f32 s11, s4 +; CHECKBE-NEXT: vrev64.32 q0, q2 +; CHECKBE-NEXT: vmov.f32 s8, s5 +; CHECKBE-NEXT: vmov.f32 s9, s6 +; CHECKBE-NEXT: vmov.f32 s10, s7 +; CHECKBE-NEXT: vmov.32 q2[3], r1 +; CHECKBE-NEXT: vrev64.32 q1, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 32 + %l1 = load <8 x i32>, ptr %p + %l2 = load i32, ptr %q + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 + ret <8 x i32> %ins +} + +define <8 x i32> @inserti32_first_multiuse(ptr %p) { +; CHECKLE-LABEL: inserti32_first_multiuse: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q1, [r0, #20] +; CHECKLE-NEXT: vldrw.u32 q0, [r0, #4] +; CHECKLE-NEXT: ldr r1, [r0] +; CHECKLE-NEXT: vmov.f32 s8, s3 +; CHECKLE-NEXT: vmov.f32 s9, s4 +; CHECKLE-NEXT: vmov.f32 s10, s5 +; CHECKLE-NEXT: vmov.f32 s11, s6 +; CHECKLE-NEXT: vadd.i32 q1, q1, q2 +; CHECKLE-NEXT: vmov.f32 s9, s0 +; CHECKLE-NEXT: vmov.f32 s10, s1 +; CHECKLE-NEXT: vmov.f32 s11, s2 +; CHECKLE-NEXT: vmov.32 q2[0], r1 +; CHECKLE-NEXT: vadd.i32 q0, q0, q2 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti32_first_multiuse: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q1, [r0, #20] +; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4] +; CHECKBE-NEXT: ldr r1, [r0] +; CHECKBE-NEXT: vmov.f32 s8, s3 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s5 +; CHECKBE-NEXT: vmov.f32 s11, s6 +; CHECKBE-NEXT: vadd.i32 q2, q1, q2 +; CHECKBE-NEXT: vrev64.32 q1, q2 +; CHECKBE-NEXT: vmov.f32 s9, s0 +; CHECKBE-NEXT: vmov.f32 s10, s1 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vmov.32 q2[0], r1 +; CHECKBE-NEXT: vadd.i32 q2, q0, q2 +; CHECKBE-NEXT: vrev64.32 q0, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <8 x i32>, ptr %q + %l2 = load i32, ptr %p + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 0 + %a = add <8 x i32> %l1, %ins + ret <8 x i32> %a +} + +define <8 x i32> @inserti32_last_multiuse(ptr %p) { +; CHECKLE-LABEL: inserti32_last_multiuse: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q0, [r0] +; CHECKLE-NEXT: vldrw.u32 q1, [r0, #16] +; CHECKLE-NEXT: ldr r1, [r0, #32] +; CHECKLE-NEXT: vmov.f32 s8, s1 +; CHECKLE-NEXT: vmov.f32 s9, s2 +; CHECKLE-NEXT: vmov.f32 s10, s3 +; CHECKLE-NEXT: vmov.f32 s11, s4 +; CHECKLE-NEXT: vadd.i32 q0, q0, q2 +; CHECKLE-NEXT: vmov.f32 s8, s5 +; CHECKLE-NEXT: vmov.f32 s9, s6 +; CHECKLE-NEXT: vmov.f32 s10, s7 +; CHECKLE-NEXT: vmov.32 q2[3], r1 +; CHECKLE-NEXT: vadd.i32 q1, q1, q2 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: inserti32_last_multiuse: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q0, [r0] +; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16] +; CHECKBE-NEXT: ldr r1, [r0, #32] +; CHECKBE-NEXT: vmov.f32 s8, s1 +; CHECKBE-NEXT: vmov.f32 s9, s2 +; CHECKBE-NEXT: vmov.f32 s10, s3 +; CHECKBE-NEXT: vmov.f32 s11, s4 +; CHECKBE-NEXT: vadd.i32 q2, q0, q2 +; CHECKBE-NEXT: vrev64.32 q0, q2 +; CHECKBE-NEXT: vmov.f32 s8, s5 +; CHECKBE-NEXT: vmov.f32 s9, s6 +; CHECKBE-NEXT: vmov.f32 s10, s7 +; CHECKBE-NEXT: vmov.32 q2[3], r1 +; CHECKBE-NEXT: vadd.i32 q2, q1, q2 +; CHECKBE-NEXT: vrev64.32 q1, q2 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 32 + %l1 = load <8 x i32>, ptr %p + %l2 = load i32, ptr %q + %s = shufflevector <8 x i32> %l1, <8 x i32> undef, <8 x i32> + %ins = insertelement <8 x i32> %s, i32 %l2, i32 7 + %a = add <8 x i32> %l1, %ins + ret <8 x i32> %a +} + +define <4 x float> @insertf32_first(ptr %p) { +; CHECKLE-LABEL: insertf32_first: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4] +; CHECKLE-NEXT: vldr s0, [r0] +; CHECKLE-NEXT: vmov.f32 s1, s4 +; CHECKLE-NEXT: vmov.f32 s2, s5 +; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: insertf32_first: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4] +; CHECKBE-NEXT: vldr s4, [r0] +; CHECKBE-NEXT: vmov.f32 s5, s0 +; CHECKBE-NEXT: vmov.f32 s6, s1 +; CHECKBE-NEXT: vmov.f32 s7, s2 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 4 + %l1 = load <4 x float>, ptr %q + %l2 = load float, ptr %p + %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> + %ins = insertelement <4 x float> %s, float %l2, i32 0 + ret <4 x float> %ins +} + +define <4 x float> @insertf32_last(ptr %p) { +; CHECKLE-LABEL: insertf32_last: +; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: vldrw.u32 q1, [r0] +; CHECKLE-NEXT: vldr s3, [r0, #16] +; CHECKLE-NEXT: vmov.f32 s0, s5 +; CHECKLE-NEXT: vmov.f32 s1, s6 +; CHECKLE-NEXT: vmov.f32 s2, s7 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: insertf32_last: +; CHECKBE: @ %bb.0: +; CHECKBE-NEXT: vldrw.u32 q0, [r0] +; CHECKBE-NEXT: vldr s7, [r0, #16] +; CHECKBE-NEXT: vmov.f32 s4, s1 +; CHECKBE-NEXT: vmov.f32 s5, s2 +; CHECKBE-NEXT: vmov.f32 s6, s3 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr + %q = getelementptr inbounds i8, ptr %p, i32 16 + %l1 = load <4 x float>, ptr %p + %l2 = load float, ptr %q + %s = shufflevector <4 x float> %l1, <4 x float> undef, <4 x i32> + %ins = insertelement <4 x float> %s, float %l2, i32 3 + ret <4 x float> %ins +} -- 2.7.4