From 460cbabe170e481d4468ef77de1fb08b02a0f2a3 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 9 Jan 2020 13:06:54 -0500
Subject: [PATCH] [x86] add tests for 2-way splat copy; NFC

Based on code in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024
---
 llvm/test/CodeGen/X86/vector-interleave.ll | 190 +++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 0c71d12..635d189 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -137,3 +137,193 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
   %result = shufflevector <32 x i16> %abcd, <32 x i16> %efgh, <64 x i32>
   ret <64 x i16> %result
 }
+
+define void @splat2_i8(<32 x i8>* %s, <64 x i8>* %d) {
+; SSE-LABEL: splat2_i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    movdqu 16(%rdi), %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
+; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
+; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqu %xmm2, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splat2_i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vmovdqu %xmm1, 48(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm3, 32(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm0, 16(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm2, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat2_i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %ld32 = load <32 x i8>, <32 x i8>* %s, align 1
+  %cat = shufflevector <32 x i8> %ld32, <32 x i8> undef, <64 x i32>
+  %cat2 = shufflevector <64 x i8> %cat, <64 x i8> undef, <64 x i32>
+  store <64 x i8> %cat2, <64 x i8>* %d, align 1
+  ret void
+}
+
+define void @splat2_i16(<16 x i16>* %s, <32 x i16>* %d) {
+; SSE-LABEL: splat2_i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    movdqu 16(%rdi), %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
+; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
+; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqu %xmm2, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splat2_i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vmovdqu %xmm1, 48(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm3, 32(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm0, 16(%rsi)
+; AVX1-NEXT:    vmovdqu %xmm2, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat2_i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovdqu %ymm1, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %ld32 = load <16 x i16>, <16 x i16>* %s, align 1
+  %cat = shufflevector <16 x i16> %ld32, <16 x i16> undef, <32 x i32>
+  %cat2 = shufflevector <32 x i16> %cat, <32 x i16> undef, <32 x i32>
+  store <32 x i16> %cat2, <32 x i16>* %d, align 1
+  ret void
+}
+
+define void @splat2_i32(<8 x i32>* %s, <16 x i32>* %d) {
+; SSE-LABEL: splat2_i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    movdqu 16(%rdi), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
+; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
+; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqu %xmm2, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splat2_i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovups (%rdi), %xmm0
+; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT:    vmovups %xmm1, 48(%rsi)
+; AVX1-NEXT:    vmovups %xmm3, 32(%rsi)
+; AVX1-NEXT:    vmovups %xmm0, 16(%rsi)
+; AVX1-NEXT:    vmovups %xmm2, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat2_i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovups (%rdi), %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovups %ymm1, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %ld32 = load <8 x i32>, <8 x i32>* %s, align 1
+  %cat = shufflevector <8 x i32> %ld32, <8 x i32> undef, <16 x i32>
+  %cat2 = shufflevector <16 x i32> %cat, <16 x i32> undef, <16 x i32>
+  store <16 x i32> %cat2, <16 x i32>* %d, align 1
+  ret void
+}
+
+define void @splat2_i64(<4 x i64>* %s, <8 x i64>* %d) {
+; SSE-LABEL: splat2_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    movdqu 16(%rdi), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT:    movdqu %xmm1, 48(%rsi)
+; SSE-NEXT:    movdqu %xmm3, 32(%rsi)
+; SSE-NEXT:    movdqu %xmm0, 16(%rsi)
+; SSE-NEXT:    movdqu %xmm2, (%rsi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: splat2_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovupd (%rdi), %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
+; AVX1-NEXT:    vmovupd %ymm0, 32(%rsi)
+; AVX1-NEXT:    vmovupd %ymm1, (%rsi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat2_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovups (%rdi), %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
+; AVX2-NEXT:    vmovups %ymm1, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %ld32 = load <4 x i64>, <4 x i64>* %s, align 1
+  %cat = shufflevector <4 x i64> %ld32, <4 x i64> undef, <8 x i32>
+  %cat2 = shufflevector <8 x i64> %cat, <8 x i64> undef, <8 x i32>
+  store <8 x i64> %cat2, <8 x i64>* %d, align 1
+  ret void
+}
-- 
2.7.4
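
Note on the IR pattern: the shufflevector mask constants were lost from the copy of the diff above, so the two-shuffle idiom the tests use is easier to see in a reduced sketch. The sketch below is hypothetical and not part of the patch: the function name, the narrow <4 x i32> width, and the exact mask layout (widening with undef lanes before duplicating each lane) are assumptions chosen to illustrate a 2-way splat copy, where source element i ends up in destination lanes 2*i and 2*i+1.

define void @splat2_i32_sketch(<4 x i32>* %s, <8 x i32>* %d) {
  ; Load the narrow source vector.
  %ld = load <4 x i32>, <4 x i32>* %s, align 1
  ; Widen to the destination element count; the upper lanes are undef.
  %wide = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Duplicate each lane: result = <a0, a0, a1, a1, a2, a2, a3, a3>.
  %splat2 = shufflevector <8 x i32> %wide, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  store <8 x i32> %splat2, <8 x i32>* %d, align 1
  ret void
}

The CHECK lines in the patch show how the backend currently lowers this shape at wider types: punpckl/punpckh pairs for SSE and AVX1, and vpermq+vpshufb or vpermps/vpermpd for AVX2.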