From 3bd02ec97704058a984b0db65a5d104830d2b1f6 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Thu, 30 Sep 2021 17:30:57 +0300
Subject: [PATCH] [NFC][X86][Codegen] Add test coverage for interleaved i64 load/store stride=2

---
 .../X86/vector-interleaved-load-i64-stride-2.ll  | 131 +++++++++++++++++++
 .../X86/vector-interleaved-store-i64-stride-2.ll | 144 +++++++++++++++++++++
 2 files changed, 275 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
 create mode 100644 llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
new file mode 100644
index 0000000..4896323
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
+
+; These patterns are produced by LoopVectorizer for interleaved loads.
+
+define void @load_i64_stride2_vf2(<4 x i64>* %in.vec, <2 x i64>* %out.vec0, <2 x i64>* %out.vec1) nounwind {
+; AVX2-LABEL: load_i64_stride2_vf2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovaps %xmm2, (%rsi)
+; AVX2-NEXT: vmovaps %xmm0, (%rdx)
+; AVX2-NEXT: retq
+  %wide.vec = load <4 x i64>, <4 x i64>* %in.vec, align 32
+
+  %strided.vec0 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec1 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+
+  store <2 x i64> %strided.vec0, <2 x i64>* %out.vec0, align 32
+  store <2 x i64> %strided.vec1, <2 x i64>* %out.vec1, align 32
+
+  ret void
+}
+
+define void @load_i64_stride2_vf4(<8 x i64>* %in.vec, <4 x i64>* %out.vec0, <4 x i64>* %out.vec1) nounwind {
+; AVX2-LABEL: load_i64_stride2_vf4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %wide.vec = load <8 x i64>, <8 x i64>* %in.vec, align 32
+
+  %strided.vec0 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec1 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+  store <4 x i64> %strided.vec0, <4 x i64>* %out.vec0, align 32
+  store <4 x i64> %strided.vec1, <4 x i64>* %out.vec1, align 32
+
+  ret void
+}
+
+define void @load_i64_stride2_vf8(<16 x i64>* %in.vec, <8 x i64>* %out.vec0, <8 x i64>* %out.vec1) nounwind {
+; AVX2-LABEL: load_i64_stride2_vf8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm4, (%rsi)
+; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %wide.vec = load <16 x i64>, <16 x i64>* %in.vec, align 32
+
+  %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec1 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+
+  store <8 x i64> %strided.vec0, <8 x i64>* %out.vec0, align 32
+  store <8 x i64> %strided.vec1, <8 x i64>* %out.vec1, align 32
+
+  ret void
+}
+
+define void @load_i64_stride2_vf16(<32 x i64>* %in.vec, <16 x i64>* %out.vec0, <16 x i64>* %out.vec1) nounwind {
+; AVX2-LABEL: load_i64_stride2_vf16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-NEXT: vmovaps 224(%rdi), %ymm4
+; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
+; AVX2-NEXT: vmovaps 160(%rdi), %ymm6
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm7
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovaps %ymm9, 96(%rsi)
+; AVX2-NEXT: vmovaps %ymm11, (%rsi)
+; AVX2-NEXT: vmovaps %ymm10, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm8, 64(%rsi)
+; AVX2-NEXT: vmovaps %ymm5, 64(%rdx)
+; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %wide.vec = load <32 x i64>, <32 x i64>* %in.vec, align 32
+
+  %strided.vec0 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %strided.vec1 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+
+  store <16 x i64> %strided.vec0, <16 x i64>* %out.vec0, align 32
+  store <16 x i64> %strided.vec1, <16 x i64>* %out.vec1, align 32
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
new file mode 100644
index 0000000..895c6af
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
+
+; These patterns are produced by LoopVectorizer for interleaved stores.
+
+define void @store_i64_stride2_vf2(<2 x i64>* %in.vecptr0, <2 x i64>* %in.vecptr1, <4 x i64>* %out.vec) nounwind {
+; AVX2-LABEL: store_i64_stride2_vf2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %in.vec0 = load <2 x i64>, <2 x i64>* %in.vecptr0, align 32
+  %in.vec1 = load <2 x i64>, <2 x i64>* %in.vecptr1, align 32
+
+  %concat01 = shufflevector <2 x i64> %in.vec0, <2 x i64> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i64> %concat01, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+
+  store <4 x i64> %interleaved.vec, <4 x i64>* %out.vec, align 32
+
+  ret void
+}
+
+define void @store_i64_stride2_vf4(<4 x i64>* %in.vecptr0, <4 x i64>* %in.vecptr1, <8 x i64>* %out.vec) nounwind {
+; AVX2-LABEL: store_i64_stride2_vf4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rsi), %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm2, (%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %in.vec0 = load <4 x i64>, <4 x i64>* %in.vecptr0, align 32
+  %in.vec1 = load <4 x i64>, <4 x i64>* %in.vecptr1, align 32
+
+  %concat01 = shufflevector <4 x i64> %in.vec0, <4 x i64> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i64> %concat01, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+
+  store <8 x i64> %interleaved.vec, <8 x i64>* %out.vec, align 32
+
+  ret void
+}
+
+define void @store_i64_stride2_vf8(<8 x i64>* %in.vecptr0, <8 x i64>* %in.vecptr1, <16 x i64>* %out.vec) nounwind {
+; AVX2-LABEL: store_i64_stride2_vf8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX2-NEXT: vmovaps (%rsi), %ymm2
+; AVX2-NEXT: vmovaps 32(%rsi), %ymm3
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
+; AVX2-NEXT: vmovaps %ymm2, 96(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %in.vec0 = load <8 x i64>, <8 x i64>* %in.vecptr0, align 32
+  %in.vec1 = load <8 x i64>, <8 x i64>* %in.vecptr1, align 32
+
+  %concat01 = shufflevector <8 x i64> %in.vec0, <8 x i64> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i64> %concat01, <16 x i64> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+
+  store <16 x i64> %interleaved.vec, <16 x i64>* %out.vec, align 32
+
+  ret void
+}
+
+define void @store_i64_stride2_vf16(<16 x i64>* %in.vecptr0, <16 x i64>* %in.vecptr1, <32 x i64>* %out.vec) nounwind {
+; AVX2-LABEL: store_i64_stride2_vf16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-NEXT: vmovaps (%rsi), %ymm4
+; AVX2-NEXT: vmovaps 32(%rsi), %ymm5
+; AVX2-NEXT: vmovaps 64(%rsi), %ymm6
+; AVX2-NEXT: vmovaps 96(%rsi), %ymm7
+; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
+; AVX2-NEXT: vmovaps %ymm3, 192(%rdx)
+; AVX2-NEXT: vmovaps %ymm6, 224(%rdx)
+; AVX2-NEXT: vmovaps %ymm2, 128(%rdx)
+; AVX2-NEXT: vmovaps %ymm5, 160(%rdx)
+; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
+; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm8, 32(%rdx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+  %in.vec0 = load <16 x i64>, <16 x i64>* %in.vecptr0, align 32
+  %in.vec1 = load <16 x i64>, <16 x i64>* %in.vecptr1, align 32
+
+  %concat01 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %interleaved.vec = shufflevector <32 x i64> %concat01, <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+
+  store <32 x i64> %interleaved.vec, <32 x i64>* %out.vec, align 32
+
+  ret void
+}
-- 
2.7.4
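
For context, here is a minimal sketch, not taken from the patch and using hypothetical function and value names, of the kind of scalar loop that the LoopVectorizer turns into the wide-load-plus-shufflevector patterns exercised by load_i64_stride2_vf*:

; A scalar loop that de-interleaves two i64 streams stored at stride 2.
define void @deinterleave_i64_stride2(i64* %in, i64* %out0, i64* %out1, i64 %n) nounwind {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %idx.even = shl i64 %i, 1                              ; 2*i
  %idx.odd = or i64 %idx.even, 1                         ; 2*i+1
  %p.even = getelementptr inbounds i64, i64* %in, i64 %idx.even
  %p.odd = getelementptr inbounds i64, i64* %in, i64 %idx.odd
  %v.even = load i64, i64* %p.even, align 8              ; in[2*i]
  %v.odd = load i64, i64* %p.odd, align 8                ; in[2*i+1]
  %q0 = getelementptr inbounds i64, i64* %out0, i64 %i
  %q1 = getelementptr inbounds i64, i64* %out1, i64 %i
  store i64 %v.even, i64* %q0, align 8                   ; out0[i] = in[2*i]
  store i64 %v.odd, i64* %q1, align 8                    ; out1[i] = in[2*i+1]
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

Conceptually, when the vectorizer forms an interleaved access group for this loop at VF=4, the two scalar loads become a single <8 x i64> wide load that is then split by the even/odd masks <i32 0, i32 2, i32 4, i32 6> and <i32 1, i32 3, i32 5, i32 7>, which is exactly the IR shape checked by load_i64_stride2_vf4 above; the store tests exercise the mirrored interleaving direction.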