From 0ad9f3e93bbec3926ce61e280a458b3d72e93c2c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 25 Aug 2016 12:45:16 +0000
Subject: [PATCH] [X86][AVX] Provide SubVectorBroadcast fallback if load fold
 fails (PR29133)

Fix for PR29133, matching the approach that was taken for AVX1 scalar
broadcasts.

llvm-svn: 279735
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |    3 +-
 llvm/lib/Target/X86/X86InstrAVX512.td        |   33 +
 llvm/lib/Target/X86/X86InstrSSE.td           |   45 +
 llvm/test/CodeGen/X86/subvector-broadcast.ll | 1223 ++++++++++++++++++++++++++
 4 files changed, 1302 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/subvector-broadcast.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a32c1526..629226b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12987,8 +12987,7 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
       // lower to a VBROADCASTF128/VBROADCASTI128/etc.
       if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
         if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
-            areOnlyUsersOf(SubVec2.getNode(), {Op, Vec}) &&
-            !Ld->hasAnyUseOfValue(1)) {
+            areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
           return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
         }
       }
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 6e072c4..b54d0d8 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1026,6 +1026,21 @@ def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v4f32 VR128X:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v4i32 VR128X:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v8i16 VR128X:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v16i8 VR128X:$src), 1)>;
 }
 
 let Predicates = [HasVLX, HasDQI] in {
@@ -1042,6 +1057,15 @@ def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
 }
 
 let Predicates = [HasDQI] in {
@@ -1057,6 +1081,15 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
 defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTI64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ed754e5..f652803 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8662,6 +8662,51 @@ let Predicates = [HasAVX2] in {
   defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
   defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
 }
+
+//===----------------------------------------------------------------------===//
+// SubVector Broadcasts
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v16i8 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2f64 VR128:$src), 1)>;
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4f32 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v16i8 VR128:$src), 1)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
 //
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
new file mode 100644
index 0000000..01c63dc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -0,0 +1,1223 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s
--check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ + +; +; Subvector Load + Broadcast +; + +define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_2f64_4f64: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcastf32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcastf64x2 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2f64_4f64: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcastf32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcastf64x2 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x double>, <2 x double> *%p + %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> + ret <4 x double> %2 +} + +define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_2f64_8f64: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcastf32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcastf64x2 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2f64_8f64: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcastf32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcastf64x2 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x double>, <2 x double> *%p + %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> + ret <8 x double> %2 +} + +define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_4f64_8f64: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4f64_8f64: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vmovapd (%eax), %ymm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX-LABEL: 
test_broadcast_4f64_8f64: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f64_8f64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovapd (%rdi), %ymm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x double>, <4 x double> *%p + %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> + ret <8 x double> %2 +} + +define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_2i64_4i64: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_2i64_4i64: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcasti64x2 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_2i64_4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2i64_4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcasti64x2 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x i64>, <2 x i64> *%p + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> + ret <4 x i64> %2 +} + +define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_2i64_8i64: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_2i64_8i64: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX2-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcasti64x2 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_2i64_8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2i64_8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, 
%ymm1 +; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcasti64x2 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x i64>, <2 x i64> *%p + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> + ret <8 x i64> %2 +} + +define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_4i64_8i64: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4i64_8i64: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vmovdqa64 (%eax), %ymm0 +; X32-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4i64_8i64: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4i64_8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovdqa64 (%rdi), %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x i64>, <4 x i64> *%p + %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> + ret <8 x i64> %2 +} + +define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_4f32_8f32: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4f32_8f32: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcastf32x4 (%eax), %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4f32_8f32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f32_8f32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcastf32x4 (%rdi), %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x float>, <4 x float> *%p + %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> + ret <8 x float> %2 +} + +define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_4f32_16f32: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_4f32_16f32: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcastf32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_4f32_16f32: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcastf32x4 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4f32_16f32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; 
X64-AVX512BW-LABEL: test_broadcast_4f32_16f32: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcastf32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_4f32_16f32: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcastf32x4 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <4 x float>, <4 x float> *%p + %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> + ret <16 x float> %2 +} + +define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_8f32_16f32: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vmovapd (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_8f32_16f32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovapd (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <8 x float>, <8 x float> *%p + %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> + ret <16 x float> %2 +} + +define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_4i32_8i32: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_4i32_8i32: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4i32_8i32: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_4i32_8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4i32_8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x i32>, <4 x i32> *%p + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> + ret <8 x i32> %2 +} + +define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_4i32_16i32: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 
{{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_4i32_16i32: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX2-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_4i32_16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <4 x i32>, <4 x i32> *%p + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> + ret <16 x i32> %2 +} + +define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_8i32_16i32: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vmovdqa32 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vmovdqa32 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_8i32_16i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqa32 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovdqa32 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <8 x i32>, <8 x i32> *%p + %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> + ret <16 x i32> %2 +} + +define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_8i16_16i16: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 
{{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_8i16_16i16: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_8i16_16i16: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_8i16_16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i16_16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_8i16_16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <8 x i16>, <8 x i16> *%p + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> + ret <16 x i16> %2 +} + +define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_8i16_32i16: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_8i16_32i16: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX2-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm1 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_8i16_32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i16_32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: retq + %1 = load <8 x i16>, <8 x i16> *%p + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> + ret <32 x i16> %2 +} + +define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_16i16_32i16: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vmovdqu16 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, 
%zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_16i16_32i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqu16 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: retq + %1 = load <16 x i16>, <16 x i16> *%p + %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> + ret <32 x i16> %2 +} + +define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_16i8_32i8: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_16i8_32i8: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_16i8_32i8: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_16i8_32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i8_32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_16i8_32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <16 x i8>, <16 x i8> *%p + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> + ret <32 x i8> %2 +} + +define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { +; X32-AVX1-LABEL: test_broadcast_16i8_64i8: +; X32-AVX1: ## BB#0: +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: test_broadcast_16i8_64i8: +; X32-AVX2: ## BB#0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X32-AVX2-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vbroadcasti32x4 (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm1 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_16i8_64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i8_64i8: +; 
X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vbroadcasti32x4 (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: retq + %1 = load <16 x i8>, <16 x i8> *%p + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> + ret <64 x i8> %2 +} + +define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { +; X32-AVX-LABEL: test_broadcast_32i8_64i8: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: vmovdqu8 (%eax), %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_32i8_64i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqu8 (%rdi), %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: retq + %1 = load <32 x i8>, <32 x i8> *%p + %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> + ret <64 x i8> %2 +} + +; +; Subvector Load + Broadcast + Store +; + +define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) { +; X32-AVX-LABEL: test_broadcast_2f64_4f64_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax) +; X32-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq 
+; +; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi) +; X64-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x double>, <2 x double>* %p0 + store <2 x double> %1, <2 x double>* %p1 + %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> + ret <4 x double> %2 +} + +define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) { +; X32-AVX-LABEL: test_broadcast_2i64_4i64_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovdqa64 %xmm0, (%eax) +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vmovdqa64 %xmm0, (%eax) +; X32-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovdqa64 %xmm0, (%rsi) +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vmovdqa64 %xmm0, (%rsi) +; X64-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <2 x i64>, <2 x i64>* %p0 + store <2 x i64> %1, <2 x i64>* %p1 + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> + ret <4 x i64> %2 +} + +define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { +; X32-AVX-LABEL: test_broadcast_4f32_8f32_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX: ## BB#0: +; 
X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x float>, <4 x float>* %p0 + store <4 x float> %1, <4 x float>* %p1 + %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> + ret <8 x float> %2 +} + +define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) { +; X32-AVX-LABEL: test_broadcast_4i32_8i32_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: vmovdqa32 (%ecx), %xmm0 +; X32-AVX512-NEXT: vmovdqa32 %xmm0, (%eax) +; X32-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovdqa32 (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa32 %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <4 x i32>, <4 x i32>* %p0 + store <4 x i32> %1, <4 x i32>* %p1 + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> + ret <8 x i32> %2 +} + +define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind { +; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqu16 (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovdqu16 %xmm0, (%eax) +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vmovdqa32 %xmm0, (%eax) +; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqu16 (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovdqu16 %xmm0, (%rsi) +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 
+; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vmovdqa32 %xmm0, (%rsi) +; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <8 x i16>, <8 x i16> *%p0 + store <8 x i16> %1, <8 x i16>* %p1 + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> + ret <16 x i16> %2 +} + +define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind { +; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vmovaps %xmm0, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqu8 (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vmovdqu8 %xmm0, (%eax) +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vmovdqa32 %xmm0, (%eax) +; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqu8 (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vmovdqu8 %xmm0, (%rsi) +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vmovdqa32 %xmm0, (%rsi) +; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <16 x i8>, <16 x i8> *%p0 + store <16 x i8> %1, <16 x i8>* %p1 + %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> + ret <32 x i8> %2 +} + +; +; Subvector Load + Broadcast with Separate Store +; + +define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { +; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; X32-AVX512BW-NEXT: vmovdqa32 %xmm1, (%eax) +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: vmovdqa32 %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <4 x i32>, <4 x i32>* %p0 + store <4 x float> zeroinitializer, <4 x float>* %p1 + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> + ret <8 x i32> %2 +} + +define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { +; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain: +; X32-AVX: ## BB#0: +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X32-AVX-NEXT: retl +; +; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: +; X32-AVX512BW: ## BB#0: +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; X32-AVX512BW-NEXT: vmovdqa32 %xmm1, (%eax) +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: retl +; +; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: +; X32-AVX512DQ: ## BB#0: +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 +; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: retl +; +; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX-NEXT: retq +; +; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: +; X64-AVX512BW: ## BB#0: +; X64-AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm0 +; X64-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: vmovdqa32 %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: retq +; +; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: +; X64-AVX512DQ: ## BB#0: +; X64-AVX512DQ-NEXT: 
vmovdqa64 (%rdi), %xmm0 +; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: retq + %1 = load <4 x i32>, <4 x i32>* %p0 + store <4 x float> zeroinitializer, <4 x float>* %p1 + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> + ret <16 x i32> %2 +} -- 2.7.4
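
Editor's note (illustration, not part of the patch itself): the case that defeats the load-fold
patterns above is a 128-bit subvector load with an additional user, for example when the loaded
value is also stored. A minimal IR sketch of that shape follows; the function name is hypothetical,
and it mirrors the test_broadcast_*_reuse cases already present in the test file above.

; The load feeds both the store and the 256-bit splat, so it cannot be folded
; into a vbroadcastf128 memory operand; with this patch the splat selects the
; vinsertf128 register fallback instead of failing to match SUBV_BROADCAST.
define <4 x double> @broadcast_with_reuse(<2 x double>* %p0, <2 x double>* %p1) {
  %v = load <2 x double>, <2 x double>* %p0
  store <2 x double> %v, <2 x double>* %p1
  %splat = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %splat
}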