From ef30e57469f1ee53ba44dab86d1975decd8da852 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 4 Feb 2023 13:10:45 +0000
Subject: [PATCH] [AArch64] Add tests for inefficient generation of ST2s. NFC

---
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll | 410 ++++++++++++++++++++++++++++++
 1 file changed, 410 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 693ad10..a54c5e9 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -294,3 +294,413 @@ vector.body:                                      ; preds = %vector.body, %entry
 while.end:                                        ; preds = %vector.body
   ret void
 }
+
+; This example has a store(shuffle(shuffle(...))) that would be better treated
+; as a single store. This avoids the vld2 for data that is already shuffled.
+define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
+; CHECK-LABEL: transpose_s16_8x8_simpler:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ldp q5, q6, [x0, #80]
+; CHECK-NEXT:    trn1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    ldr q4, [x8, #64]!
+; CHECK-NEXT:    ldr q1, [x0, #112]
+; CHECK-NEXT:    trn1 v3.8h, v4.8h, v5.8h
+; CHECK-NEXT:    trn1 v1.8h, v6.8h, v1.8h
+; CHECK-NEXT:    trn1 v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    trn1 v4.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v0.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT:    st2 { v3.2s, v4.2s }, [x0]
+; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i16>, ptr %a, align 16
+  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
+  %1 = load <8 x i16>, ptr %arrayidx1, align 16
+  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32>
+  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
+  %2 = load <8 x i16>, ptr %arrayidx2, align 16
+  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
+  %3 = load <8 x i16>, ptr %arrayidx3, align 16
+  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32>
+  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
+  %4 = load <8 x i16>, ptr %arrayidx5, align 16
+  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
+  %5 = load <8 x i16>, ptr %arrayidx6, align 16
+  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32>
+  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
+  %6 = load <8 x i16>, ptr %arrayidx8, align 16
+  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
+  %7 = load <8 x i16>, ptr %arrayidx9, align 16
+  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32>
+  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
+  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
+  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
+  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
+  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32>
+  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
+  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
+  store <4 x i32> %vzip.i, ptr %a, align 16
+  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
+  ret void
+}
+
+; Same as above, with some different shuffles.
+define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
+; CHECK-LABEL: transpose_s16_8x8_simpler2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldp q3, q4, [x0, #32]
+; CHECK-NEXT:    mov v0.h[5], v2.h[4]
+; CHECK-NEXT:    ldp q6, q7, [x0, #80]
+; CHECK-NEXT:    zip1 v3.8h, v3.8h, v4.8h
+; CHECK-NEXT:    ldr q5, [x8, #64]!
+; CHECK-NEXT:    ldr q2, [x0, #112]
+; CHECK-NEXT:    zip1 v4.8h, v5.8h, v6.8h
+; CHECK-NEXT:    mov v7.h[5], v2.h[4]
+; CHECK-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i16>, ptr %a, align 16
+  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
+  %1 = load <8 x i16>, ptr %arrayidx1, align 16
+  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32>
+  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
+  %2 = load <8 x i16>, ptr %arrayidx2, align 16
+  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
+  %3 = load <8 x i16>, ptr %arrayidx3, align 16
+  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32>
+  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
+  %4 = load <8 x i16>, ptr %arrayidx5, align 16
+  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
+  %5 = load <8 x i16>, ptr %arrayidx6, align 16
+  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32>
+  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
+  %6 = load <8 x i16>, ptr %arrayidx8, align 16
+  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
+  %7 = load <8 x i16>, ptr %arrayidx9, align 16
+  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32>
+  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
+  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
+  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32>
+  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
+  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
+  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32>
+  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
+  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32>
+  store <4 x i32> %vzip.i, ptr %a, align 16
+  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
+  ret void
+}
+
+
+define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
+; CHECK-LABEL: transpose_s16_8x8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x4]
+; CHECK-NEXT:    ldr q3, [x2]
+; CHECK-NEXT:    ldr q4, [x3]
+; CHECK-NEXT:    ldr q6, [x5]
+; CHECK-NEXT:    trn1 v5.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ldr q16, [x7]
+; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ldr q2, [x6]
+; CHECK-NEXT:    trn1 v7.8h, v3.8h, v4.8h
+; CHECK-NEXT:    trn2 v3.8h, v3.8h, v4.8h
+; CHECK-NEXT:    trn1 v4.8h, v0.8h, v6.8h
+; CHECK-NEXT:    trn1 v17.8h, v2.8h, v16.8h
+; CHECK-NEXT:    trn2 v0.8h, v0.8h, v6.8h
+; CHECK-NEXT:    trn2 v2.8h, v2.8h, v16.8h
+; CHECK-NEXT:    trn1 v18.4s, v5.4s, v4.4s
+; CHECK-NEXT:    trn1 v19.4s, v7.4s, v17.4s
+; CHECK-NEXT:    trn1 v20.4s, v1.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    trn1 v21.4s, v3.4s, v2.4s
+; CHECK-NEXT:    trn2 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x0]
+; CHECK-NEXT:    trn2 v1.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ext v2.16b, v18.16b, v18.16b, #8
+; CHECK-NEXT:    trn2 v5.4s, v7.4s, v17.4s
+; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x1]
+; CHECK-NEXT:    ext v3.16b, v19.16b, v19.16b, #8
+; CHECK-NEXT:    ext v6.16b, v20.16b, v20.16b, #8
+; CHECK-NEXT:    ext v7.16b, v21.16b, v21.16b, #8
+; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x2]
+; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x4]
+; CHECK-NEXT:    ext v2.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT:    ext v3.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT:    st2 { v6.2s, v7.2s }, [x5]
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x6]
+; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x7]
+; CHECK-NEXT:    ret
+  %9 = load <8 x i16>, ptr %0, align 16
+  %10 = load <8 x i16>, ptr %1, align 16
+  %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32>
+  %12 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32>
+  %13 = load <8 x i16>, ptr %2, align 16
+  %14 = load <8 x i16>, ptr %3, align 16
+  %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32>
+  %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32>
+  %17 = load <8 x i16>, ptr %4, align 16
+  %18 = load <8 x i16>, ptr %5, align 16
+  %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32>
+  %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32>
+  %21 = load <8 x i16>, ptr %6, align 16
+  %22 = load <8 x i16>, ptr %7, align 16
+  %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32>
+  %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32>
+  %25 = bitcast <8 x i16> %11 to <4 x i32>
+  %26 = bitcast <8 x i16> %19 to <4 x i32>
+  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
+  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
+  %29 = bitcast <8 x i16> %12 to <4 x i32>
+  %30 = bitcast <8 x i16> %20 to <4 x i32>
+  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
+  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
+  %33 = bitcast <8 x i16> %15 to <4 x i32>
+  %34 = bitcast <8 x i16> %23 to <4 x i32>
+  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
+  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
+  %37 = bitcast <8 x i16> %16 to <4 x i32>
+  %38 = bitcast <8 x i16> %24 to <4 x i32>
+  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
+  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
+  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
+  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
+  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
+  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
+  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
+  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
+  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
+  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
+  store <4 x i32> %41, ptr %0, align 16
+  store <4 x i32> %43, ptr %1, align 16
+  store <4 x i32> %45, ptr %2, align 16
+  store <4 x i32> %47, ptr %3, align 16
+  store <4 x i32> %42, ptr %4, align 16
+  store <4 x i32> %44, ptr %5, align 16
+  store <4 x i32> %46, ptr %6, align 16
+  store <4 x i32> %48, ptr %7, align 16
+  ret void
+}
+
+define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
+; CHECK-LABEL: transpose_s16_8x8_:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    mov x10, x0
+; CHECK-NEXT:    mov x11, x0
+; CHECK-NEXT:    mov x12, x0
+; CHECK-NEXT:    mov x13, x0
+; CHECK-NEXT:    mov x14, x0
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x8, #16]!
+; CHECK-NEXT:    ldr q2, [x9, #32]!
+; CHECK-NEXT:    ldr q3, [x10, #48]!
+; CHECK-NEXT:    ldr q4, [x11, #64]!
+; CHECK-NEXT:    ldr q6, [x12, #80]!
+; CHECK-NEXT:    ldr q7, [x13, #96]!
+; CHECK-NEXT:    ldr q16, [x14, #112]!
+; CHECK-NEXT:    trn1 v5.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
+; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT:    trn1 v3.8h, v4.8h, v6.8h
+; CHECK-NEXT:    trn2 v4.8h, v4.8h, v6.8h
+; CHECK-NEXT:    trn1 v17.8h, v7.8h, v16.8h
+; CHECK-NEXT:    trn2 v6.8h, v7.8h, v16.8h
+; CHECK-NEXT:    trn1 v18.4s, v5.4s, v3.4s
+; CHECK-NEXT:    trn1 v20.4s, v0.4s, v4.4s
+; CHECK-NEXT:    trn1 v19.4s, v1.4s, v17.4s
+; CHECK-NEXT:    trn1 v21.4s, v2.4s, v6.4s
+; CHECK-NEXT:    trn2 v22.4s, v5.4s, v3.4s
+; CHECK-NEXT:    trn2 v23.4s, v1.4s, v17.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x0]
+; CHECK-NEXT:    trn2 v1.4s, v2.4s, v6.4s
+; CHECK-NEXT:    ext v2.16b, v18.16b, v18.16b, #8
+; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x8]
+; CHECK-NEXT:    ext v4.16b, v20.16b, v20.16b, #8
+; CHECK-NEXT:    ext v3.16b, v19.16b, v19.16b, #8
+; CHECK-NEXT:    st2 { v22.2s, v23.2s }, [x9]
+; CHECK-NEXT:    ext v5.16b, v21.16b, v21.16b, #8
+; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x10]
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x11]
+; CHECK-NEXT:    ext v2.16b, v22.16b, v22.16b, #8
+; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x12]
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v23.16b, v23.16b, #8
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x13]
+; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x14]
+; CHECK-NEXT:    ret
+  %2 = load <8 x i16>, ptr %0, align 16
+  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
+  %4 = load <8 x i16>, ptr %3, align 1
+  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32>
+  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32>
+  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
+  %8 = load <8 x i16>, ptr %7, align 16
+  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
+  %10 = load <8 x i16>, ptr %9, align 16
+  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32>
+  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32>
+  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
+  %14 = load <8 x i16>, ptr %13, align 16
+  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
+  %16 = load <8 x i16>, ptr %15, align 16
+  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32>
+  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32>
+  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
+  %20 = load <8 x i16>, ptr %19, align 16
+  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
+  %22 = load <8 x i16>, ptr %21, align 16
+  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32>
+  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32>
+  %25 = bitcast <8 x i16> %5 to <4 x i32>
+  %26 = bitcast <8 x i16> %17 to <4 x i32>
+  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
+  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32>
+  %29 = bitcast <8 x i16> %6 to <4 x i32>
+  %30 = bitcast <8 x i16> %18 to <4 x i32>
+  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
+  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32>
+  %33 = bitcast <8 x i16> %11 to <4 x i32>
+  %34 = bitcast <8 x i16> %23 to <4 x i32>
+  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
+  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32>
+  %37 = bitcast <8 x i16> %12 to <4 x i32>
+  %38 = bitcast <8 x i16> %24 to <4 x i32>
+  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
+  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32>
+  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
+  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32>
+  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
+  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32>
+  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
+  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32>
+  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
+  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32>
+  store <4 x i32> %41, ptr %0, align 16
+  store <4 x i32> %43, ptr %3, align 16
+  store <4 x i32> %45, ptr %7, align 16
+  store <4 x i32> %47, ptr %9, align 16
+  store <4 x i32> %42, ptr %13, align 16
+  store <4 x i32> %44, ptr %15, align 16
+  store <4 x i32> %46, ptr %19, align 16
+  store <4 x i32> %48, ptr %21, align 16
+  ret void
+}
+
+define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: store_factor2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
+; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32>
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: store_factor2_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x1]
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32>
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32>
+  %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32>
+  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
+  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
+  ret void
+}
+
+define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: store_factor2_high2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    trn1 v0.4s, v2.4s, v1.4s
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    st2 { v3.2s, v4.2s }, [x1]
+; CHECK-NEXT:    ret
+  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
+  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
+  ret void
+}
+
+define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: store_factor3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
+; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
+; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
+; CHECK-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
+; CHECK-NEXT:    mov v5.s[0], v2.s[0]
+; CHECK-NEXT:    st3 { v3.4s, v4.4s, v5.4s }, [x0]
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32>
+  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32>
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32>
+  store <12 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; CHECK-LABEL: store_factor4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    trn1 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn1 v5.4s, v1.4s, v2.4s
+; CHECK-NEXT:    trn1 v6.4s, v2.4s, v3.4s
+; CHECK-NEXT:    trn1 v7.4s, v3.4s, v0.4s
+; CHECK-NEXT:    st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0]
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32>
+  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32>
+  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32>
+  %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32>
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32>
+  store <16 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
-- 
2.7.4
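
For reference, a minimal sketch of the single-shuffle form, assuming the shape that AArch64's interleaved-access lowering already recognizes; it is not part of the patch and the function name is illustrative. One interleaving shufflevector feeding a single wide store maps directly to an st2, which is the result the two-level store(shuffle(shuffle(...))) tests above currently fail to reach:

define void @store_factor2_canonical(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
  ; A single interleaving shuffle (mask 0,4,1,5,2,6,3,7) feeding one wide store;
  ; this pattern is expected to lower to st2 { v0.4s, v1.4s }, [x0].
  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}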