From d6a36619cec44d02a2a3526eceb2ac128d90e030 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 31 May 2023 15:33:44 +0100
Subject: [PATCH] [X86] X86FixupVectorConstantsPass - use VBROADCASTSS/VBROADCASTSD
 for integer vector loads on AVX1-only targets

This matches the existing behaviour in lowerBuildVectorAsBroadcast. A short
before/after sketch of the resulting AVX1 codegen is appended after the diff.
---
 llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 6 +-
 llvm/test/CodeGen/X86/abdu-vector-128.ll | 15 +-
 llvm/test/CodeGen/X86/abdu-vector-256.ll | 12 +-
 .../X86/any_extend_vector_inreg_of_broadcast.ll | 3 +-
 ...extend_vector_inreg_of_broadcast_from_memory.ll | 3 +-
 llvm/test/CodeGen/X86/avx-logic.ll | 12 +-
 llvm/test/CodeGen/X86/avx-shift.ll | 12 +-
 .../CodeGen/X86/bitcast-int-to-vector-bool-zext.ll | 4 +-
 .../test/CodeGen/X86/bitcast-int-to-vector-bool.ll | 2 +-
 .../CodeGen/X86/broadcast-elm-cross-splat-vec.ll | 12 +-
 llvm/test/CodeGen/X86/combine-add.ll | 2 +-
 llvm/test/CodeGen/X86/combine-bitreverse.ll | 2 +-
 llvm/test/CodeGen/X86/combine-bitselect.ll | 7 +-
 llvm/test/CodeGen/X86/combine-pavg.ll | 2 +-
 llvm/test/CodeGen/X86/combine-pmuldq.ll | 3 +-
 llvm/test/CodeGen/X86/combine-sdiv.ll | 15 +-
 llvm/test/CodeGen/X86/combine-smax.ll | 2 +-
 llvm/test/CodeGen/X86/combine-smin.ll | 2 +-
 llvm/test/CodeGen/X86/combine-sub-usat.ll | 2 +-
 llvm/test/CodeGen/X86/combine-udiv.ll | 2 +-
 llvm/test/CodeGen/X86/combine-urem.ll | 4 +-
 llvm/test/CodeGen/X86/concat-cast.ll | 3 +-
 llvm/test/CodeGen/X86/freeze-vector.ll | 8 +-
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 14 +-
 llvm/test/CodeGen/X86/gfni-rotates.ll | 8 +-
 llvm/test/CodeGen/X86/gfni-shifts.ll | 18 +-
 llvm/test/CodeGen/X86/horizontal-reduce-umax.ll | 9 +-
 llvm/test/CodeGen/X86/horizontal-reduce-umin.ll | 9 +-
 llvm/test/CodeGen/X86/i64-to-float.ll | 6 +-
 .../CodeGen/X86/insert-into-constant-vector.ll | 3 +-
 llvm/test/CodeGen/X86/masked_store_trunc.ll | 4 +-
 llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 56 +-
 llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 59 +-
 llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 13 +-
 llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 93 +-
 llvm/test/CodeGen/X86/paddus.ll | 32 +-
 llvm/test/CodeGen/X86/pmaddubsw.ll | 3 +-
 llvm/test/CodeGen/X86/pr31773.ll | 4 +-
 llvm/test/CodeGen/X86/psubus.ll | 56 +-
 llvm/test/CodeGen/X86/sadd_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll | 4 +-
 .../CodeGen/X86/shuffle-strided-with-offset-256.ll | 23 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 16 +-
 llvm/test/CodeGen/X86/splat-for-size.ll | 8 +-
 llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 4 +-
 llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 4 +-
 llvm/test/CodeGen/X86/ssub_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll | 11 +-
 llvm/test/CodeGen/X86/umax.ll | 5 +-
 ...nfold-masked-merge-vector-variablemask-const.ll | 8 +-
 llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll | 2 +-
 llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 4 +-
 .../CodeGen/X86/urem-seteq-vec-tautological.ll | 3 +-
 llvm/test/CodeGen/X86/usub_sat_vec.ll | 13 +-
 llvm/test/CodeGen/X86/var-permute-256.ll | 24 +-
 llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll | 6 +-
 llvm/test/CodeGen/X86/vec_anyext.ll | 2 +-
 llvm/test/CodeGen/X86/vec_cast3.ll | 3 +-
 llvm/test/CodeGen/X86/vec_cmp_uint-128.ll | 12 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll | 15 +-
 llvm/test/CodeGen/X86/vec_minmax_uint.ll | 24 +-
 llvm/test/CodeGen/X86/vec_smulo.ll | 6 +-
 llvm/test/CodeGen/X86/vec_uaddo.ll | 5 +-
 llvm/test/CodeGen/X86/vec_umulo.ll | 6 +-
 llvm/test/CodeGen/X86/vec_usubo.ll | 5 +-
 llvm/test/CodeGen/X86/vector-bitreverse.ll | 24 +-
 llvm/test/CodeGen/X86/vector-blend.ll | 4 +-
 .../X86/vector-constrained-fp-intrinsics.ll | 6 +-
 llvm/test/CodeGen/X86/vector-fshl-128.ll | 24 +-
 llvm/test/CodeGen/X86/vector-fshl-256.ll | 28 +-
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 8 +-
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 22 +-
 llvm/test/CodeGen/X86/vector-fshr-128.ll | 30 +-
 llvm/test/CodeGen/X86/vector-fshr-256.ll | 44 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 10 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 24 +-
 llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 14 +-
 llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 32 +-
 llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 10 +-
 llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 24 +-
 llvm/test/CodeGen/X86/vector-idiv.ll | 2 +-
 .../X86/vector-interleaved-load-i16-stride-3.ll | 18 +-
 .../X86/vector-interleaved-load-i8-stride-2.ll | 15 +-
 .../X86/vector-interleaved-load-i8-stride-4.ll | 56 +-
 .../X86/vector-interleaved-load-i8-stride-5.ll | 84 +-
 .../X86/vector-interleaved-load-i8-stride-6.ll | 169 ++--
 .../X86/vector-interleaved-load-i8-stride-7.ll | 223 +++--
 .../X86/vector-interleaved-load-i8-stride-8.ll | 240 ++--
 .../X86/vector-interleaved-store-i8-stride-5.ll | 54 +-
 .../X86/vector-interleaved-store-i8-stride-7.ll | 3 +-
 .../X86/vector-interleaved-store-i8-stride-8.ll | 30 +-
 llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 16 +-
 llvm/test/CodeGen/X86/vector-mul.ll | 4 +-
 llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll | 709 ++++++++-------
 llvm/test/CodeGen/X86/vector-popcnt-128.ll | 8 +-
 llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 986 ++++++++++++---------
 llvm/test/CodeGen/X86/vector-popcnt-256.ll | 24 +-
 llvm/test/CodeGen/X86/vector-reduce-umax.ll | 12 +-
 llvm/test/CodeGen/X86/vector-reduce-umin.ll | 12 +-
 llvm/test/CodeGen/X86/vector-rotate-128.ll | 8 +-
 llvm/test/CodeGen/X86/vector-rotate-256.ll | 24 +-
 llvm/test/CodeGen/X86/vector-shift-ashr-128.ll | 18 +-
 llvm/test/CodeGen/X86/vector-shift-ashr-256.ll | 42 +-
 llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll | 12 +-
 llvm/test/CodeGen/X86/vector-shift-lshr-256.ll | 18 +-
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 4 +-
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 26 +-
 llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll | 4 +-
 llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 33 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 3 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 18 +-
 llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 7 +-
 llvm/test/CodeGen/X86/vector-trunc-math.ll | 28 +-
 llvm/test/CodeGen/X86/vector-trunc-packus.ll | 52 +-
 llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 104 ++-
 llvm/test/CodeGen/X86/vector-trunc-usat.ll | 100 ++-
 llvm/test/CodeGen/X86/vector-trunc.ll | 4 +-
 llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 16 +-
 llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 16 +-
 llvm/test/CodeGen/X86/vector-unsigned-cmp.ll | 8 +-
 .../X86/vector_splat-const-shift-of-constmasked.ll | 16 +-
 llvm/test/CodeGen/X86/vselect-avx.ll | 4 +-
 llvm/test/CodeGen/X86/vselect-minmax.ll | 69 +-
 llvm/test/CodeGen/X86/vselect-pcmp.ll | 6 +-
 llvm/test/CodeGen/X86/x86-interleaved-access.ll | 32 +-
 .../X86/zero_extend_vector_inreg_of_broadcast.ll | 10 +-
 ...extend_vector_inreg_of_broadcast_from_memory.ll | 10 +-
 127 files changed, 2508 insertions(+), 1896 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 161978b..94e221f 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -312,14 +312,16 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
     if (ST->hasAVX2())
       return ConvertToBroadcast(0, 0, X86::VPBROADCASTQrm, X86::VPBROADCASTDrm,
                                 X86::VPBROADCASTWrm, X86::VPBROADCASTBrm, 1);
-    return false;
+    return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+                              1);
   case X86::VMOVDQAYrm:
   case X86::VMOVDQUYrm:
     if (ST->hasAVX2())
       return ConvertToBroadcast(0, X86::VBROADCASTI128, X86::VPBROADCASTQYrm,
                                 X86::VPBROADCASTDYrm, X86::VPBROADCASTWYrm,
                                 X86::VPBROADCASTBYrm, 1);
-    return false;
+    return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm,
+                              X86::VBROADCASTSSYrm, 0, 0, 1);
   case X86::VMOVDQA32Z128rm:
   case X86::VMOVDQA64Z128rm:
   case X86::VMOVDQU32Z128rm:
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index b10a6fa..e090370 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -250,7 +250,8 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX1-LABEL: abd_ext_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -327,7 +328,8 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX1-LABEL: abd_ext_v2i64_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -497,7 +499,8 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX1-LABEL: abd_minmax_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -664,7 +667,8 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX1-LABEL: abd_cmp_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -745,7 +749,8 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
 ;
 ; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll
index 3957133..884515c 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll
@@ -221,7 +221,8
@@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: abd_ext_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 @@ -267,7 +268,8 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: abd_ext_v4i64_undef: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 @@ -416,7 +418,8 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: abd_minmax_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 @@ -566,7 +569,8 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: abd_cmp_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index d9ce46f..9ecc89b 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1075,7 +1075,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index bae04d9..5047041 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -894,7 +894,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = 
[1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll index 8fcd70d..3b14e5a 100644 --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -314,7 +314,7 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -342,7 +342,7 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -450,7 +450,7 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -479,7 +479,7 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -537,7 +537,7 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -566,7 +566,7 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll index 8d825782..1da78eb 100644 --- a/llvm/test/CodeGen/X86/avx-shift.ll +++ b/llvm/test/CodeGen/X86/avx-shift.ll @@ -105,9 +105,9 @@ define <32 x i8> @vshift09(<32 x i8> %a) { ; CHECK: # %bb.0: ; 
CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0 @@ -138,7 +138,7 @@ define <32 x i8> @vshift11(<32 x i8> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -153,7 +153,7 @@ define <32 x i8> @vshift12(<32 x i8> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpsllw $2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpsllw $2, %xmm0, %xmm0 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -169,7 +169,7 @@ define <8 x i32> @vshift08(<8 x i32> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpslld $23, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpslld $23, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 @@ -184,7 +184,7 @@ define <8 x i32> @vshift08_add(<8 x i32> %a, <8 x i32> %y) { ; CHECK-LABEL: vshift08_add: ; CHECK: # %bb.0: ; CHECK-NEXT: vpslld $23, %xmm0, %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vcvttps2dq %xmm2, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index c22ec12..544d9b2 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -458,7 +458,7 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) { ; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 @@ -812,7 +812,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; 
AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index aba2f2e..50747d2 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -253,7 +253,7 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) { ; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 9450099..083269b 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -19,7 +19,7 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) { ; AVX-LABEL: f16xi8_i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -33,7 +33,7 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) { ; ; AVX-64-LABEL: f16xi8_i16: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -124,7 +124,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i16: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -141,7 +141,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) { ; AVX-64-LABEL: f32xi8_i16: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -288,7 +288,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i16: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -320,7 +320,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-64-LABEL: f64xi8_i16: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 diff 
--git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index 72b987d..cc08c75 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -245,7 +245,7 @@ define void @PR52039(ptr %pa, ptr %pb) { ; ; AVX1-LABEL: PR52039: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [10,10,10,10] ; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll index 35107e3..c2b9cbb 100644 --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -429,7 +429,7 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind { ; X64-LABEL: test_demandedbits_bitreverse: ; X64: # %bb.0: ; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm2 ; X64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; X64-NEXT: vpshufb %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 6a0dc4a..6c266be 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -466,7 +466,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; XOP-LABEL: bitselect_v4i64_mm: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rsi), %ymm0 -; XOP-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; XOP-NEXT: # ymm1 = mem[0,1,0,1] ; XOP-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0 ; XOP-NEXT: retq ; @@ -1092,7 +1093,7 @@ define void @constantfold_andn_mask() nounwind { ; XOP-NEXT: pushq %rax ; XOP-NEXT: callq use@PLT ; XOP-NEXT: vmovdqu (%rax), %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] ; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpavgb %xmm2, %xmm0, %xmm0 @@ -1110,7 +1111,7 @@ define void @constantfold_andn_mask() nounwind { ; AVX1-NEXT: pushq %rax ; AVX1-NEXT: callq use@PLT ; AVX1-NEXT: vmovdqu (%rax), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index 4a3b765..6ec9542 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -33,7 +33,7 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16 ; ; AVX1-LABEL: combine_pavgw_knownbits: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] 
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index c3d23f4..e1d963a 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -116,7 +116,8 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [715827883,715827883] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [715827883,715827883] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 8ede236..549fe72 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -402,7 +402,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -467,7 +468,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; XOP-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; XOP-NEXT: # xmm2 = mem[0,0] ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq %1 = sdiv <16 x i8> %x, @@ -1735,7 +1737,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm1 = mem[0,0] ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 @@ -1946,7 +1949,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm3 = mem[0,0] ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 @@ -1956,7 +1960,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = +; XOP-NEXT: vmovddup {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] +; XOP-NEXT: # xmm7 = mem[0,0] ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 ; 
XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll index 828a36d..a5b6a54 100644 --- a/llvm/test/CodeGen/X86/combine-smax.ll +++ b/llvm/test/CodeGen/X86/combine-smax.ll @@ -34,7 +34,7 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; ; AVX1-LABEL: test_v16i8_nosignbit: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll index 6a69bd0..6a44c6b 100644 --- a/llvm/test/CodeGen/X86/combine-smin.ll +++ b/llvm/test/CodeGen/X86/combine-smin.ll @@ -34,7 +34,7 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; ; AVX1-LABEL: test_v16i8_nosignbit: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll index 2e36ffc..8be82ef 100644 --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -250,7 +250,7 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) { ; AVX1-LABEL: combine_trunc_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index e013d8c..12ac819 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -359,7 +359,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; XOP-LABEL: combine_vec_udiv_by_shl_pow2a: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294] ; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll index 434f2627..d17ea10 100644 --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -254,7 +254,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; AVX1-LABEL: combine_vec_urem_by_pow2d: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 @@ -385,7 +385,7 @@ define <4 x i32> 
@combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) { ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4] ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll index b898be5..74697c5 100644 --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -362,7 +362,8 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) { ; AVX1-LABEL: mismatch_tofp_v4i32_v4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index d4dd264..d9ee5f0 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -280,7 +280,7 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] ; X86-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) @@ -313,7 +313,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx ; X86-NEXT: andl $15, %ecx -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] ; X86-NEXT: vpinsrd $0, %ecx, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 @@ -350,7 +350,7 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst ; X86-NEXT: movl (%edx), %edx ; X86-NEXT: andl $15, %edx ; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7] +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vmovd %edx, %xmm0 @@ -403,7 +403,7 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1, ; X86-NEXT: vmovd %edx, %xmm1 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7] +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,7,7,7] ; X86-NEXT: vpand %xmm2, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 6fe3de8..6cd85e0 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -91,14 +91,14 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 
; GFNIAVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsllw $4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -147,14 +147,14 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsllw $2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -212,7 +212,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $7, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -292,14 +292,14 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $2, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; GFNIAVX1-NEXT: vpsllw $6, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index ff9dc24..29b58d0 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ 
-95,7 +95,7 @@ define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; GFNIAVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 @@ -151,7 +151,7 @@ define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; GFNIAVX1-NEXT: vpsllw $2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 @@ -218,7 +218,7 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 @@ -300,7 +300,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 ; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index 14efd6a..015cc54 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -66,7 +66,7 @@ define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -109,7 +109,7 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -145,7 +145,7 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; 
GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -186,9 +186,9 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 @@ -240,7 +240,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -289,7 +289,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -347,9 +347,9 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 3af28d3..5fde9bd 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -111,7 +111,8 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor 
%xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -616,7 +617,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-LABEL: test_reduce_v4i64: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -1366,7 +1368,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v8i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index 5985dca..699dce7 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -113,7 +113,8 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -561,7 +562,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; ; X64-AVX1-LABEL: test_reduce_v4i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm1 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 @@ -1285,7 +1287,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-LABEL: test_reduce_v8i64: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: ## xmm3 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X64-AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll index b38f5c2..9662542 100644 --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -353,10 +353,12 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; ; X64-AVX-LABEL: clamp_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] 
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 646ca17..0f1135566 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -152,7 +152,8 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; ; X64-AVX1-LABEL: elt0_v2i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; X64-AVX1-NEXT: # xmm0 = mem[0,0] ; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index a6f4296..3de5e4d 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1452,7 +1452,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -3841,7 +3841,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index ffb3142..3379bea 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -293,7 +293,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 @@ -303,7 +304,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm6 @@ -669,7 +671,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: 
truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -680,7 +683,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 @@ -1214,7 +1218,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -1225,7 +1230,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 @@ -1630,13 +1636,15 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 @@ -1859,13 +1867,15 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-LABEL: truncstore_v4i64_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] ; 
AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2180,17 +2190,19 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -2434,10 +2446,12 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2591,10 +2605,12 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-LABEL: truncstore_v2i64_v2i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2788,10 +2804,12 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-LABEL: truncstore_v2i64_v2i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm3 
= mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index e288692..b32f495 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -228,9 +228,11 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295] ; AVX1-NEXT: # xmm6 = mem[0,0] @@ -542,9 +544,11 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -1016,9 +1020,11 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -1392,9 +1398,11 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295] ; AVX1-NEXT: # xmm6 = mem[0,0] @@ -1588,9 +1596,11 @@ define void @truncstore_v4i64_v4i16(<4 x 
i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -1870,9 +1880,11 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-LABEL: truncstore_v4i64_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] ; AVX1-NEXT: # xmm6 = mem[0,0] @@ -1881,7 +1893,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] @@ -2108,7 +2120,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] ; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2245,7 +2258,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] ; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2422,7 +2436,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] ; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = 
[9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2826,7 +2841,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 @@ -3533,7 +3548,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255] ; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 @@ -4122,7 +4137,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 @@ -4517,7 +4532,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 @@ -5675,7 +5690,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -6669,7 +6684,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index c34ffc5..a499782 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -1117,7 +1117,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; ; AVX1-LABEL: vec128_i64_unsigned_reg_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; 
AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -2465,7 +2466,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero @@ -2694,7 +2695,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero @@ -2934,7 +2935,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero @@ -3174,7 +3175,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero @@ -3417,7 +3418,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: 
vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 8d9ec21..4c605b1 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -399,7 +399,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -457,7 +458,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -552,7 +554,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX1-LABEL: vec256_i64_unsigned_reg_reg: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 @@ -568,7 +571,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 @@ -630,7 +634,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -740,7 +745,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -800,7 +806,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -910,7 +917,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -970,7 +978,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -1081,7 +1090,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -1143,7 +1153,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -1259,7 +1270,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou ; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 @@ -1295,7 +1306,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou ; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 @@ -1383,7 +1394,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, 
%xmm6, %xmm6 ; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 @@ -1421,7 +1432,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 @@ -1511,7 +1522,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw ; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 @@ -1549,7 +1560,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw ; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1 @@ -1639,7 +1650,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 @@ -1677,7 +1688,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2 @@ -1768,7 +1779,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 @@ -1808,7 +1819,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1 @@ -1906,16 +1917,16 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero @@ -1975,7 +1986,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 @@ -2091,16 +2102,16 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpmaxub %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero @@ -2162,7 +2173,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss 
{{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 @@ -2280,16 +2291,16 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero @@ -2351,7 +2362,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 @@ -2469,16 +2480,16 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} 
xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero @@ -2540,7 +2551,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 @@ -2659,16 +2670,16 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero @@ -2732,7 +2743,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll index 0638836..3a73ca1 100644 --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -219,7 +219,7 @@ define <32 x i8> @test8(<32 x i8> %x) { ; AVX1-LABEL: test8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -251,7 +251,7 @@ define <32 x i8> @test9(<32 x i8> %x) { ; AVX1-LABEL: test9: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] ; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -283,7 +283,7 @@ define <32 x i8> @test10(<32 x i8> %x) { ; AVX1-LABEL: test10: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] ; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -373,7 +373,7 @@ define <32 x i8> @test12(<32 x i8> %x) { ; AVX1-LABEL: test12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -474,7 +474,7 @@ define <64 x i8> @test14(<64 x i8> %x) { ; AVX1-LABEL: test14: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -514,7 +514,7 @@ define <64 x i8> @test15(<64 x i8> %x) { ; AVX1-LABEL: test15: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] ; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -554,7 +554,7 @@ define <64 x i8> @test16(<64 x i8> %x) { ; AVX1-LABEL: test16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] ; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -682,7 +682,7 @@ define <64 x i8> @test18(<64 x i8> %x) { ; AVX1-LABEL: test18: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -939,7 +939,7 @@ define <16 x i16> @test26(<16 x i16> %x) { ; AVX1-LABEL: test26: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] ; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -971,7 +971,7 @@ define <16 x i16> @test27(<16 x i16> %x) { ; AVX1-LABEL: test27: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32769,32769,32769,32769,32769,32769,32769,32769] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32769,32769,32769,32769,32769,32769,32769,32769] ; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1003,7 +1003,7 @@ define <16 x i16> @test28(<16 x i16> %x) { ; AVX1-LABEL: test28: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] ; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1125,7 +1125,7 @@ define <16 x i16> @test30(<16 x i16> %x) { ; AVX1-LABEL: test30: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1226,7 +1226,7 @@ define <32 x i16> @test32(<32 x i16> %x) { ; AVX1-LABEL: test32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] ; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1266,7 +1266,7 @@ define <32 x i16> @test33(<32 x i16> %x) { ; AVX1-LABEL: test33: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769] ; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1306,7 +1306,7 @@ define <32 x i16> @test34(<32 x i16> %x) { ; AVX1-LABEL: test34: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65534,65534,65534,65534,65534,65534,65534,65534] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65534,65534,65534,65534,65534,65534,65534,65534] ; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1490,7 +1490,7 @@ define <32 x i16> @test36(<32 x i16> %x) { ; AVX1-LABEL: test36: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll index a553fbe..ea0b4e4 100644 --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -320,7 +320,8 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] diff --git a/llvm/test/CodeGen/X86/pr31773.ll b/llvm/test/CodeGen/X86/pr31773.ll index 2089d53..68f9e96 100644 --- a/llvm/test/CodeGen/X86/pr31773.ll +++ b/llvm/test/CodeGen/X86/pr31773.ll @@ -8,7 +8,7 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { ; AVX-LABEL: usat_trunc_wb_256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX-NEXT: vpminuw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -30,7 +30,7 @@ define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) { ; AVX-LABEL: usat_trunc_dw_256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] ; AVX-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index b4b2adb..4fa5ef2 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -131,7 +131,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind { ; ; AVX1-LABEL: ashr_xor_and_custom: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -172,7 +172,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind { ; ; AVX1-LABEL: ashr_add_and_custom: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -215,7 +215,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind { ; ; AVX1-LABEL: usubsat_custom: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2147483648,2147483648,2147483648,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -391,7 +391,7 @@ define <16 x i16> @test7(<16 x i16> %x) nounwind { ; AVX1-LABEL: test7: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -424,7 +424,7 @@ define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind { ; AVX1-LABEL: ashr_xor_and_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -456,7 +456,7 @@ define <16 x i16> @ashr_add_and_v16i16(<16 x i16> %x) nounwind { ; AVX1-LABEL: ashr_add_and_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -488,7 +488,7 @@ define <16 x i16> @test8(<16 x i16> %x) nounwind { ; AVX1-LABEL: test8: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -594,7 +594,7 @@ define <32 x i8> @test10(<32 x i8> %x) nounwind { ; AVX1-LABEL: test10: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -627,7 +627,7 @@ define <32 x i8> @test11(<32 x i8> %x) nounwind { ; AVX1-LABEL: test11: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -803,7 +803,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX1-LABEL: test13: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -1069,7 +1069,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX1-LABEL: test15: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -1592,7 +1592,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX1-LABEL: psubus_8i32_max: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -1742,9 +1742,11 @@ define <8 x i16> 
@psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; AVX1-LABEL: psubus_8i64_max: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -1862,7 +1864,7 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; AVX1-LABEL: psubus_16i32_max: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 @@ -1962,7 +1964,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; AVX1-LABEL: psubus_i16_i32_max_swapped: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -2057,7 +2059,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX1-LABEL: psubus_i16_i32_min: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -2303,7 +2305,7 @@ define <32 x i8> @test23(<32 x i8> %x) { ; AVX1-LABEL: test23: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2367,7 +2369,7 @@ define <16 x i16> @test25(<16 x i16> %x) { ; AVX1-LABEL: test25: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2433,7 +2435,7 @@ define <64 x i8> @test27(<64 x i8> %x) { ; AVX1-LABEL: test27: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2646,7 +2648,7 @@ 
define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { ; AVX1-LABEL: test32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -2800,9 +2802,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; AVX1-LABEL: test33: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -3025,9 +3029,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] ; AVX1-NEXT: # xmm7 = mem[0,0] diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 3207fe9..8d914ba 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -515,14 +515,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX1-LABEL: v16i4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 0177acf..cf41a91 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2134,7 +2134,7 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-AVX1-NEXT: # imm = 0x2007 ; X86-AVX1-NEXT: movl %eax, (%eax) -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199] ; X86-AVX1-NEXT: vpmulld 
%xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) @@ -2337,7 +2337,7 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199] ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %r10d, %xmm2 ; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 3b9c5a3..42f1bd7 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -16,7 +16,8 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -150,7 +151,7 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -210,7 +211,7 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -270,7 +271,7 @@ define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -537,7 +538,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -597,7 +598,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
<2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -657,7 +658,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -717,7 +718,7 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -777,7 +778,7 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -837,7 +838,7 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -897,7 +898,7 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 07e1c56..7fdc7e9 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -20,7 +20,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { ; AVX1-LABEL: shuffle_v32i8_to_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -386,7 +386,7 @@ define void 
@shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -451,7 +451,7 @@ define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -518,7 +518,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -691,7 +691,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1020,7 +1020,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1193,7 +1193,7 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1258,7 +1258,7 @@ define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git 
a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll index de7da3f9..5b54d94 100644 --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -274,7 +274,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { ; AVX-LABEL: splat_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -293,7 +293,7 @@ define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 { ; AVX-LABEL: splat_v16i16_pgso: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -344,7 +344,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { ; AVX-LABEL: splat_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -363,7 +363,7 @@ define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 { ; AVX-LABEL: splat_v32i8_pgso: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index fb4ad4a..47f7555 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2362,12 +2362,12 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index b042f12..d2a1e5e 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -343,7 +343,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-AVX1-LABEL: 
test_srem_odd_undef1: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -454,7 +454,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-AVX1-LABEL: test_srem_even_undef1: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 056cb2c..14f1985 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -515,14 +515,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX1-LABEL: v16i4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index b99e139..f97603e 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -492,7 +492,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX1-LABEL: v16i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -914,7 +914,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1 @@ -990,7 +991,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm5 ; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 @@ -1100,7 +1102,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index 2e48307..2c93f6a 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -371,7 +371,8 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: test_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -789,7 +790,7 @@ define <8 x i32> @test_v8i32_1(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_v8i32_1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] ; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll index cca56dc..00d1228 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -251,7 +251,7 @@ define <4 x i32> @out_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-XOP-LABEL: out_constant_varx_42_invmask: ; CHECK-XOP: # %bb.0: ; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 -; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42] ; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, ptr%px, align 16 @@ -289,7 +289,7 @@ define <4 x i32> @in_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-XOP-LABEL: in_constant_varx_42_invmask: ; CHECK-XOP: # %bb.0: ; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 -; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42] ; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, ptr%px, align 16 @@ -459,7 +459,7 @@ define <4 x i32> @out_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { ; CHECK-XOP-LABEL: out_constant_42_vary: ; CHECK-XOP: # %bb.0: ; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 -; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42] ; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, ptr%px, align 16 @@ -496,7 +496,7 @@ define <4 x i32> @in_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { ; CHECK-XOP-LABEL: in_constant_42_vary: ; CHECK-XOP: # %bb.0: ; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 -; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42] ; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, 
%xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, ptr%px, align 16 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index a3c7c40..f2819bc 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -194,7 +194,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2047,2047,2047,2047] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index 0b9a413..12c1fe9 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -287,7 +287,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-AVX1-LABEL: test_urem_odd_undef1: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -382,7 +382,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-AVX1-LABEL: test_urem_even_undef1: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index 30441fa..b2b895f 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -221,7 +221,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; ; CHECK-AVX1-LABEL: t3_wide: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX1-NEXT: # xmm1 = mem[0,0] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 705019b..a9cf029 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -491,7 +491,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX1-LABEL: v16i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 
@@ -823,7 +823,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -894,7 +895,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 @@ -997,7 +999,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8 @@ -1113,7 +1116,7 @@ define void @PR48223(ptr %p0) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] ; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index ea337ef..6c07c4c 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -133,8 +133,8 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind { ; XOP-LABEL: var_shuffle_v16i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256] -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514] ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1 @@ -146,14 +146,14 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; ; AVX1-LABEL: var_shuffle_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6 @@ -275,7 +275,7 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX1-LABEL: var_shuffle_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6 @@ -679,8 +679,8 @@ entry: define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind { ; XOP-LABEL: var_shuffle_v16i16_from_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256] -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514] ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1 @@ -691,14 +691,14 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; ; AVX1-LABEL: var_shuffle_v16i16_from_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 @@ -820,7 +820,7 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX1-LABEL: var_shuffle_v32i8_from_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index cea5523..7c788d2 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -85,7 +85,8 @@ define <2 x float> @uitofp_v2i32_v2f32(<2 x i32> %x) #0 { ; AVX1-LABEL: uitofp_v2i32_v2f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 @@ -1099,7 +1100,8 @@ define <2 x double> @uitofp_v2i32_v2f64(<2 x i32> 
%x) #0 { ; AVX1-LABEL: uitofp_v2i32_v2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index 020f8ea..cdd3016 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovdqa (%ecx), %xmm0 ; X86-NEXT: vmovdqa 16(%ecx), %xmm1 -; X86-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll index 1596316..43bb538 100644 --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -54,7 +54,8 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) { ; CHECK-LABEL: cvt_v2u32_v2f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; CHECK-NEXT: ## xmm1 = mem[0,0] ; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll index 1cff56e..9a0756e 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -332,7 +332,8 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: ge_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -508,7 +509,8 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: gt_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -754,7 +756,8 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: le_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -931,7 +934,8 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: lt_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 7b1fd08..8cf6045 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -55,7 +55,8 @@ define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) { ; AVX1-LABEL: uitofp_2i32_to_2f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 @@ -679,7 +680,8 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { ; AVX1-LABEL: uitofp_2i32_to_2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -3363,7 +3365,8 @@ define <2 x double> @uitofp_load_2i32_to_2f64(ptr%a) { ; AVX1-LABEL: uitofp_load_2i32_to_2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -5660,10 +5663,12 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX1-NEXT: # xmm6 = mem[0,0] diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll index 76faaca..3ddc882a 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -62,7 +62,8 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX1-LABEL: max_gt_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -177,7 +178,8 
@@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: max_gt_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -468,7 +470,8 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX1-LABEL: max_ge_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -583,7 +586,8 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: max_ge_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -873,7 +877,8 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX1-LABEL: min_lt_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -988,7 +993,8 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: min_lt_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -1281,7 +1287,8 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX1-LABEL: min_le_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1396,7 +1403,8 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: min_le_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index eb8627e..1792a0f 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1393,7 +1393,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: 
vpmulhw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm3 @@ -1759,7 +1759,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm5 ; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4 @@ -2427,7 +2427,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7 ; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 3ee9292..6fa02c4 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -857,7 +857,8 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; ; AVX1-LABEL: uaddo_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 @@ -1096,7 +1097,7 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; ; AVX1-LABEL: uaddo_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 3e8ee21..63e487b 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1169,7 +1169,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -1498,7 +1498,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -2099,7 +2099,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 49d169c..999ceac 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -904,7 +904,8 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; ; AVX1-LABEL: usubo_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 @@ -1143,7 +1144,7 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; ; AVX1-LABEL: usubo_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index ace5b3d..d3f357c 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -449,7 +449,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { ; ; AVX1-LABEL: test_bitreverse_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: 
vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -553,7 +553,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; AVX1-LABEL: test_bitreverse_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -666,7 +666,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; AVX1-LABEL: test_bitreverse_v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -781,7 +781,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; AVX1-LABEL: test_bitreverse_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -912,7 +912,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; AVX1-LABEL: test_bitreverse_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -1090,7 +1090,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1290,7 +1290,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1494,7 +1494,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1727,7 +1727,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; AVX1-LABEL: test_bitreverse_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -2026,7 +2026,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 @@ -2384,7 +2384,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 @@ -2750,7 +2750,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index eaa3790..502dc9c 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -86,7 +86,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; ; AVX1-LABEL: vsel_4xi8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -270,7 +270,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { ; ; AVX1-LABEL: vsel_i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 9a43d31..cdabd7f 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6849,7 +6849,8 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -6883,7 +6884,8 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 2feafb8..35689ec 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -67,7 +67,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 @@ -154,7 +155,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -266,7 +268,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX1-LABEL: var_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 @@ -361,7 +363,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> 
%x, <4 x i32> %y, <4 x i32> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -485,7 +487,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 @@ -585,7 +587,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -738,7 +740,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero @@ -888,7 +890,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 @@ -981,7 +983,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1063,7 +1066,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 13cbd95..bdbc7bf 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -163,7 +163,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 @@ -180,7 +180,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 @@ -282,7 +282,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265] ; XOPAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; XOPAVX1-NEXT: vpsrld $1, %xmm6, %xmm6 @@ -320,7 +320,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmulld %xmm7, %xmm5, %xmm5 @@ -435,7 +435,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521] ; XOPAVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; XOPAVX1-NEXT: vpsrlw $1, %xmm6, %xmm6 @@ -483,7 +483,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero @@ -698,7 
+698,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] ; XOPAVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 @@ -743,7 +743,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 @@ -831,7 +832,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 @@ -2245,14 +2247,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -2317,12 +2319,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 08402ab..37d4f3b 100644 --- 
a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -69,7 +69,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] @@ -351,7 +352,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -739,7 +740,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index e2fe10b..d03b0c2 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -37,7 +37,8 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [63,63] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -129,10 +130,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] @@ -236,11 +237,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd 
%xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -373,7 +374,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 @@ -382,7 +383,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 @@ -390,7 +391,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8 ; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 @@ -570,7 +571,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 @@ -1625,7 +1627,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 5e6d79b..ea54d05 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -67,7 +67,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -155,7 +156,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; 
XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 @@ -267,7 +269,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX1-LABEL: var_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 @@ -363,7 +365,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 @@ -527,7 +529,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX1-LABEL: var_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4 @@ -546,7 +548,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -643,7 +645,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4 @@ -840,7 +842,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX1-LABEL: var_funnnel_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 @@ -992,7 +994,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v16i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 @@ -1101,7 +1103,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup 
{{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1184,7 +1187,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1461,7 +1465,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 @@ -2007,7 +2011,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 624e3e8..053b3bb 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -177,10 +177,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX1-NEXT: vpsrld %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [31,31,31,31] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 @@ -286,7 +286,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [31,31,31,31] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [31,31,31,31] ; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; XOPAVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 @@ 
-335,11 +335,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero @@ -466,7 +466,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; XOPAVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 @@ -512,16 +512,16 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5 ; AVX1-NEXT: vpsllw $4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm6 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 ; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8 ; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6 ; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 ; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 @@ -530,17 +530,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX1-NEXT: vpsrlw $4, %xmm8, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm9 ; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7 ; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpsrlw $2, %xmm8, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm11, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm12, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7 @@ -726,7 +726,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; XOPAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 @@ -774,7 +774,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 @@ -863,7 +864,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 @@ -1142,7 +1144,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 @@ -1614,7 +1616,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,1,2,4,8,16,32,64] ; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm7 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,64,32,16,8,4,2,1] @@ -2048,14 +2050,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -2120,12 +2122,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 0dab0a4..9c5fe49 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -69,7 +69,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] @@ -369,7 +370,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -766,7 +767,8 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -1117,7 +1119,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; 
AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 755c098..8c196c5 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -37,7 +37,8 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [63,63] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -136,10 +137,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] @@ -251,11 +252,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -394,7 +395,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 @@ -405,7 +406,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, 
%xmm8 @@ -413,7 +414,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm9 ; AVX1-NEXT: vpor %xmm3, %xmm9, %xmm3 @@ -601,7 +602,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlq %xmm3, %xmm4, %xmm5 @@ -913,7 +915,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 @@ -1676,7 +1678,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 511203c..6d51310 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -116,7 +116,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; AVX1-LABEL: test_div7_4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] ; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -194,7 +194,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632] ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -204,7 
+204,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -494,7 +494,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX1-LABEL: test_rem7_4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] ; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -587,7 +587,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632] ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -597,7 +597,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -767,7 +767,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index 549c681..bb8e673 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -90,7 +90,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[2454267027,2454267027,2454267027,2454267027] ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] @@ -132,7 +132,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_div7_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 @@ -161,7 +161,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632] ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -170,12 +170,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1 @@ -268,7 +268,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] @@ -458,7 +458,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] @@ -509,7 +509,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind { ; 
AVX1-LABEL: test_rem7_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 @@ -546,7 +546,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632] ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -555,17 +555,17 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -670,12 +670,12 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll 
b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 8f9f93a..6f85fd5 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -112,7 +112,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; AVX1-LABEL: test_div7_4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -209,7 +209,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -565,7 +565,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX1-LABEL: test_rem7_4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -684,7 +684,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -870,7 +870,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index b38de74..fb5f520 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -92,7 +92,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_div7_8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -134,7 +134,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_div7_16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 @@ -167,7 +167,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -176,11 +176,11 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 @@ -489,7 +489,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757] ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] @@ -540,7 +540,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_rem7_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 @@ -580,7 +580,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -589,14 +589,14 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -701,7 +701,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll index 33779a9c..3ff3f8d 100644 --- a/llvm/test/CodeGen/X86/vector-idiv.ll +++ b/llvm/test/CodeGen/X86/vector-idiv.ll @@ -55,7 +55,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; AVX1-LABEL: PR20355: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] ; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 6dda700..a2a3f9e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -988,14 +988,16 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; 
AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] @@ -1007,7 +1009,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 @@ -1827,7 +1830,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] @@ -1843,7 +1847,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] @@ -1886,7 +1891,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm12 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 84f6952..d9e15ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -176,13 +176,14 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -269,7 +270,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -280,7 +281,8 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -415,7 +417,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 @@ -436,7 +438,8 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 
f42d49c..d995051 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -218,21 +218,21 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -407,7 +407,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -415,34 +415,34 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -775,13 +775,13 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 @@ -801,11 +801,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] @@ -819,11 +819,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -837,11 +837,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1541,7 +1541,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 @@ -1550,7 +1550,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 @@ -1610,13 +1610,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -1662,12 +1662,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1703,13 +1703,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index 5e0f383..99db42b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -152,7 +152,7 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3 @@ -3145,7 +3145,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 @@ -3153,15 +3153,17 @@ define void 
@load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3189,14 +3191,18 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <1,6,11,128,128,128,128,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,0,5,10,15,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2 @@ -3204,7 +3210,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -3213,16 +3220,19 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = 
[0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3237,7 +3247,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 @@ -3276,14 +3286,17 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u] @@ -3292,7 +3305,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm8, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12 @@ 
-3323,7 +3336,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,8,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u] @@ -3338,10 +3352,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,12,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] @@ -3383,7 +3399,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,2,7,12,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,2,7,12,0,0,128,128,128,2,7,12,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6,7] @@ -3451,37 +3468,44 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[4,9,14],zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,9,14,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index b15879a..b017965 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -383,7 +383,8 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,4,128,128,128,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 @@ -1730,7 +1731,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 @@ -1741,9 +1742,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u] @@ -1789,10 +1790,12 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm5 @@ -1819,9 +1822,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm5, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] @@ -1876,9 +1881,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; 
AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 @@ -1904,9 +1911,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3587,10 +3596,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 @@ -3634,15 +3645,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm3 @@ -3659,17 +3672,19 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,4,10,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 @@ -3696,14 +3711,16 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,5,11,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 @@ -3720,13 +3737,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup 
{{.*#+}} xmm2 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -3750,8 +3769,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 @@ -3765,20 +3786,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <4,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,0,6,12,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] @@ -3800,17 +3823,19 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <5,11,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,7,13,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] +; 
AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] @@ -3828,23 +3853,27 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 @@ -3898,10 +3927,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 
@@ -3955,16 +3986,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,8,14,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 @@ -3996,16 +4031,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,11,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 @@ -4035,14 +4074,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -4094,15 +4136,18 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <5,11,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 @@ -4113,7 +4158,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm13[1],xmm12[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index d12a7d7..c1525cf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -225,16 +225,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} 
xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -616,7 +616,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] @@ -642,7 +642,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -1431,7 +1431,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -1456,7 +1456,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm14, %xmm8 @@ -2984,7 +2985,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm13, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] @@ -2997,12 +2998,12 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 @@ -3089,7 +3090,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] @@ -3116,7 +3117,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] @@ -3127,7 +3128,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[2,9,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = 
[0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 @@ -3141,7 +3143,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] @@ -6702,25 +6704,30 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $728, %rsp # imm = 0x2D8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,5,12,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,14,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 @@ -6740,18 +6747,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 @@ -6774,20 +6782,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 @@ -6805,14 +6815,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = @@ -6826,17 +6840,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,6,13,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <1,8,15,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm10 @@ -6855,15 +6871,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,9,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,0,7,14,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> @@ -6880,14 +6898,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm2 = <128,128,1,8,15,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 @@ -6901,17 +6921,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -6930,14 +6952,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm11, %xmm3 @@ -6951,15 +6975,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm3, %xmm7 @@ -6973,9 +6999,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = @@ -6988,9 +7016,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, (%rsp), %xmm6, %xmm3 # 16-byte Folded Reload @@ -7000,9 +7030,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb 
%xmm6, %xmm11, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm7, %xmm1, %xmm1 @@ -7012,23 +7044,27 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm6, %xmm9, %xmm3 @@ -7049,9 +7085,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] @@ -7060,7 +7096,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 @@ -7167,20 +7203,21 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm12 @@ -7221,19 +7258,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm13[7] ; AVX1-ONLY-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload @@ -7262,9 +7302,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = 
<2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] @@ -7275,7 +7315,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 @@ -7326,7 +7367,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u] @@ -7336,12 +7377,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 @@ -7367,7 +7409,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 @@ -7385,24 +7428,26 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 657c353..1afeca0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -240,37 +240,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -592,7 +592,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 @@ -600,70 +600,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: 
vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vpshufb 
%xmm10, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1271,20 +1271,20 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1292,143 +1292,143 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = 
[0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: 
vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm13[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -2830,7 +2830,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride8_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2839,7 +2839,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 @@ -2848,7 +2848,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -2860,7 +2860,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -2902,23 +2902,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -2951,23 +2951,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: 
vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7 @@ -2999,23 +2999,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 @@ -3047,25 +3047,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3096,23 +3096,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -3146,24 +3146,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -3193,25 +3193,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -7473,7 +7473,7 @@ define void 
@load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-LABEL: load_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 @@ -7481,7 +7481,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 @@ -7491,7 +7491,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7499,7 +7499,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7610,23 +7610,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -7708,24 +7708,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -7805,26 +7805,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 @@ -7903,25 +7903,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ 
-8002,25 +8002,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 @@ -8099,26 +8099,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -8197,24 +8197,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 4f2ee5d..43e2f89 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1428,7 +1428,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} 
xmm12 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1437,7 +1438,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] @@ -1488,16 +1490,20 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] @@ -2922,7 +2928,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,u,u,u,7,128,u,u,u,8,128,u,u,u,9> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -2932,11 +2939,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] @@ -2956,14 +2965,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -3012,7 +3024,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm3[6],zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,xmm3[8],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] @@ -3020,10 +3033,12 @@ define void @store_i8_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] @@ -3124,18 +3139,21 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15] ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,6,u,u,u,128,7,u,u,u,128,8,u,u,u,128> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index ac61b2b..4667d2e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -476,7 +476,8 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] ; 
AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index d71a6f8..69fb583 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -187,11 +187,13 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,3,7,11,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,1,5,9,13,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -359,39 +361,47 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,3,11,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,8,u,u,1,9,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; 
AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <6,14,u,u,7,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,5,13,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index 12d6b9c..3c53d21 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -16,7 +16,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7 @@ -202,7 +202,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7 @@ -388,7 +388,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss 
{{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -544,7 +544,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -700,7 +700,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -821,7 +821,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -942,7 +942,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 @@ -1038,7 +1038,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 34ab53e..35af92f 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -567,7 +567,7 @@ define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind { ; X64-XOP-LABEL: mul_v32i8_17: ; X64-XOP: # %bb.0: ; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3 ; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2 @@ -910,7 +910,7 @@ define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind { 
; X64-XOP-LABEL: mul_v32i8_neg5: ; X64-XOP: # %bb.0: ; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3 ; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; X64-XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index 8e8aca1..364dc18 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -197,7 +197,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ugt_2_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -333,7 +333,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ult_3_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -341,7 +341,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -473,7 +473,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ugt_3_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -609,7 +609,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ult_4_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -617,7 +617,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -749,7 +749,7 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ugt_4_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -885,7 +885,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ult_5_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -893,7 +893,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -1025,7 +1025,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ugt_5_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1161,7 +1161,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ult_6_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1169,7 +1169,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -1301,7 +1301,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ugt_6_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1437,7 +1437,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; ; AVX1-LABEL: ult_7_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1445,7 +1445,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -1704,7 +1704,7 @@ 
define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_2_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1862,7 +1862,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_3_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1873,7 +1873,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2022,7 +2022,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_3_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2180,7 +2180,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_4_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2191,7 +2191,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2340,7 +2340,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_4_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2498,7 +2498,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_5_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2509,7 +2509,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, 
%xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2658,7 +2658,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_5_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2816,7 +2816,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_6_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2827,7 +2827,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2976,7 +2976,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_6_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3134,7 +3134,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_7_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3145,7 +3145,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3294,7 +3294,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_7_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3452,7 +3452,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_8_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: 
vpshufb %xmm2, %xmm3, %xmm2 @@ -3463,7 +3463,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3612,7 +3612,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_8_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3770,7 +3770,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_9_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3781,7 +3781,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3930,7 +3930,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_9_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4088,7 +4088,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_10_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4099,7 +4099,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -4248,7 +4248,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_10_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4406,7 +4406,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_11_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4417,7 +4417,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -4566,7 +4566,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_11_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4724,7 +4724,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_12_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4735,7 +4735,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -4884,7 +4884,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_12_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5042,7 +5042,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_13_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5053,7 +5053,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -5202,7 +5202,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_13_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, 
%xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5360,7 +5360,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_14_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5371,7 +5371,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -5520,7 +5520,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ugt_14_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5678,7 +5678,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; ; AVX1-LABEL: ult_15_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5689,7 +5689,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -5992,7 +5992,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_2_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6180,7 +6180,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_3_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6194,7 +6194,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -6366,7 +6366,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> 
%0) { ; ; AVX1-LABEL: ugt_3_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6554,7 +6554,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_4_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6568,7 +6568,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -6740,7 +6740,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_4_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6928,7 +6928,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_5_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6942,7 +6942,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -7114,7 +7114,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_5_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7302,7 +7302,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_6_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7316,7 +7316,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -7488,7 +7488,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_6_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7676,7 +7676,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_7_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7690,7 +7690,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -7862,7 +7862,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_7_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8050,7 +8050,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_8_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8064,7 +8064,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -8236,7 +8236,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_8_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8424,7 +8424,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_9_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8438,7 +8438,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -8610,7 +8610,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_9_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8798,7 +8798,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_10_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8812,7 +8812,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -8984,7 +8984,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_10_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9172,7 +9172,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_11_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9186,7 +9186,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -9358,7 +9358,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_11_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9546,7 +9546,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) 
{ ; ; AVX1-LABEL: ult_12_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9560,7 +9560,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -9732,7 +9732,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_12_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9920,7 +9920,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_13_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9934,7 +9934,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -10106,7 +10106,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_13_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10294,7 +10294,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_14_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10308,7 +10308,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -10480,7 +10480,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_14_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10668,7 +10668,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_15_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10682,7 +10682,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -10854,7 +10854,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_15_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11042,7 +11042,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_16_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11056,7 +11056,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -11228,7 +11228,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_16_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11416,7 +11416,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_17_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11430,7 +11430,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17] +; 
AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -11602,7 +11602,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_17_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11790,7 +11790,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_18_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11804,7 +11804,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -11976,7 +11976,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_18_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12164,7 +12164,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_19_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12178,7 +12178,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -12350,7 +12350,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_19_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12538,7 +12538,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_20_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12552,7 +12552,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -12724,7 +12724,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_20_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12912,7 +12912,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_21_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12926,7 +12926,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -13098,7 +13098,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_21_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13286,7 +13286,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_22_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13300,7 +13300,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -13472,7 +13472,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_22_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13660,7 +13660,7 @@ define <4 x i32> 
@ult_23_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_23_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13674,7 +13674,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -13846,7 +13846,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_23_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14034,7 +14034,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_24_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14048,7 +14048,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -14220,7 +14220,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_24_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14408,7 +14408,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_25_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14422,7 +14422,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -14594,7 +14594,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_25_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14782,7 +14782,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_26_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14796,7 +14796,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -14968,7 +14968,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_26_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15156,7 +15156,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_27_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15170,7 +15170,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -15342,7 +15342,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_27_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15530,7 +15530,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_28_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15544,7 +15544,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -15716,7 +15716,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_28_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15904,7 +15904,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_29_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15918,7 +15918,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -16090,7 +16090,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_29_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16278,7 +16278,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_30_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16292,7 +16292,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -16464,7 +16464,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ugt_30_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16652,7 +16652,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; ; AVX1-LABEL: ult_31_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16666,7 +16666,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17039,7 +17039,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_2_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17197,7 +17197,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_3_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17207,7 +17207,8 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17355,7 +17356,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_3_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17513,7 +17514,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_4_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17523,7 +17524,8 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17671,7 +17673,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_4_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17829,7 +17831,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; ; 
AVX1-LABEL: ult_5_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17839,7 +17841,8 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17987,7 +17990,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_5_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18145,7 +18148,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_6_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18155,7 +18158,8 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18303,7 +18307,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_6_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18461,7 +18465,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_7_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18471,7 +18475,8 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18619,7 +18624,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_7_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18777,7 +18782,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_8_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18787,7 +18792,8 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18935,7 +18941,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_8_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19093,7 +19099,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_9_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19103,7 +19109,8 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -19251,7 +19258,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_9_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19409,7 +19416,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_10_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19419,7 +19426,8 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
AVX1-NEXT: retq ; @@ -19567,7 +19575,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_10_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19725,7 +19733,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_11_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19735,7 +19743,8 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -19883,7 +19892,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_11_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20041,7 +20050,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_12_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20051,7 +20060,8 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20199,7 +20209,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_12_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20357,7 +20367,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_13_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20367,7 +20377,8 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; 
AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20515,7 +20526,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_13_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20673,7 +20684,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_14_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20683,7 +20694,8 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20831,7 +20843,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_14_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20989,7 +21001,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_15_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20999,7 +21011,8 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21147,7 +21160,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_15_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21305,7 +21318,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_16_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21315,7 +21328,8 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21463,7 +21477,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_16_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21621,7 +21635,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_17_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21631,7 +21645,8 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21779,7 +21794,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_17_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21937,7 +21952,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_18_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21947,7 +21962,8 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22095,7 +22111,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_18_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22253,7 +22269,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_19_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22263,7 +22279,8 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22411,7 +22428,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_19_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22569,7 +22586,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_20_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22579,7 +22596,8 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22727,7 +22745,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_20_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22885,7 +22903,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_21_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22895,7 +22913,8 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23043,7 +23062,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_21_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23201,7 +23220,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_22_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23211,7 +23230,8 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23359,7 +23379,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_22_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23517,7 +23537,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_23_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23527,7 +23547,8 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23675,7 +23696,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_23_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23833,7 +23854,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_24_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23843,7 +23864,8 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23991,7 +24013,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_24_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24149,7 +24171,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_25_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24159,7 +24181,8 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24307,7 +24330,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_25_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24465,7 +24488,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_26_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24475,7 +24498,8 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24623,7 +24647,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_26_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24781,7 +24805,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_27_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24791,7 +24815,8 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24939,7 +24964,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_27_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25097,7 +25122,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_28_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25107,7 +25132,8 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25255,7 +25281,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_28_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25413,7 +25439,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_29_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25423,7 +25449,8 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25571,7 +25598,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_29_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25729,7 +25756,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: 
ult_30_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25739,7 +25766,8 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25887,7 +25915,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_30_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26045,7 +26073,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_31_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26055,7 +26083,8 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26203,7 +26232,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_31_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26361,7 +26390,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_32_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26371,7 +26400,8 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26519,7 +26549,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_32_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26677,7 +26707,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_33_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26687,7 +26717,8 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26835,7 +26866,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_33_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26993,7 +27024,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_34_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27003,7 +27034,8 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27151,7 +27183,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_34_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27309,7 +27341,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_35_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27319,7 +27351,8 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27467,7 +27500,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_35_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27625,7 +27658,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_36_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27635,7 +27668,8 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27783,7 +27817,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_36_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27941,7 +27975,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_37_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27951,7 +27985,8 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28099,7 +28134,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_37_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28257,7 +28292,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_38_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28267,7 +28302,8 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> 
%0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28415,7 +28451,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_38_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28573,7 +28609,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_39_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28583,7 +28619,8 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28731,7 +28768,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_39_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28889,7 +28926,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_40_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28899,7 +28936,8 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29047,7 +29085,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_40_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29205,7 +29243,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_41_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29215,7 +29253,8 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29363,7 +29402,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_41_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29521,7 +29560,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_42_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29531,7 +29570,8 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29679,7 +29719,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_42_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29837,7 +29877,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_43_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29847,7 +29887,8 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29995,7 +30036,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_43_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30153,7 +30194,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_44_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30163,7 +30204,8 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30311,7 +30353,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_44_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30469,7 +30511,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_45_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30479,7 +30521,8 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30627,7 +30670,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_45_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30785,7 +30828,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_46_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30795,7 +30838,8 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30943,7 +30987,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: 
ugt_46_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31101,7 +31145,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_47_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31111,7 +31155,8 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31259,7 +31304,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_47_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31417,7 +31462,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_48_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31427,7 +31472,8 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31575,7 +31621,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_48_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31733,7 +31779,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_49_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31743,7 +31789,8 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31891,7 +31938,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_49_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32049,7 +32096,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_50_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32059,7 +32106,8 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32207,7 +32255,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_50_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32365,7 +32413,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_51_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32375,7 +32423,8 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32523,7 +32572,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_51_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32681,7 +32730,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_52_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32691,7 +32740,8 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32839,7 +32889,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_52_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32997,7 +33047,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_53_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33007,7 +33057,8 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33155,7 +33206,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_53_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33313,7 +33364,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_54_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33323,7 +33374,8 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33471,7 +33523,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_54_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33629,7 +33681,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { 
; ; AVX1-LABEL: ult_55_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33639,7 +33691,8 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33787,7 +33840,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_55_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33945,7 +33998,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_56_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33955,7 +34008,8 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34103,7 +34157,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_56_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34261,7 +34315,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_57_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34271,7 +34325,8 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34419,7 +34474,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_57_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34577,7 +34632,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_58_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34587,7 +34642,8 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34735,7 +34791,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_58_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34893,7 +34949,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_59_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34903,7 +34959,8 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35051,7 +35108,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_59_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35209,7 +35266,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_60_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35219,7 +35276,8 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq 
%xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35367,7 +35425,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_60_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35525,7 +35583,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_61_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35535,7 +35593,8 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35683,7 +35742,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_61_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35841,7 +35900,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_62_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35851,7 +35910,8 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35999,7 +36059,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ugt_62_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36157,7 +36217,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; ; AVX1-LABEL: ult_63_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36167,7 +36227,8 @@ define <2 x i64> @ult_63_v2i64(<2 
x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index 2179214..61f0885 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -86,7 +86,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX1-LABEL: testv2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -251,7 +251,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX1-LABEL: testv4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -424,7 +424,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX1-LABEL: testv8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -567,7 +567,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX1-LABEL: testv16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index fad3eff..c1a248f 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -138,7 +138,7 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_2_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -153,7 +153,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm0 @@ -222,7 +222,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_3_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -237,7 +237,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -311,7 +311,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_3_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -326,7 +326,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -395,7 +395,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_4_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -410,7 +410,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -484,7 +484,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_4_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -499,7 +499,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -568,7 +568,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_5_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -583,7 +583,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -657,7 +657,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_5_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -672,7 +672,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -741,7 +741,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_6_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -756,7 +756,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -830,7 +830,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_6_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -845,7 +845,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -914,7 +914,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_7_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -929,7 +929,7 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1131,7 +1131,7 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_2_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1152,7 +1152,7 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1212,7 +1212,7 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_3_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1233,7 +1233,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1298,7 +1298,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_3_v16i16: ; AVX1: # %bb.0: -; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1319,7 +1319,7 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1379,7 +1379,7 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_4_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1400,7 +1400,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1465,7 +1465,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_4_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1486,7 +1486,7 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1546,7 +1546,7 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_5_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1567,7 +1567,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 @@ -1632,7 +1632,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_5_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1653,7 +1653,7 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1713,7 +1713,7 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) { define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_6_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1734,7 +1734,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1799,7 +1799,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_6_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1820,7 +1820,7 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1880,7 +1880,7 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) { define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_7_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1901,7 +1901,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: 
vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1966,7 +1966,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_7_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1987,7 +1987,7 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2047,7 +2047,7 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) { define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_8_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2068,7 +2068,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2133,7 +2133,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_8_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2154,7 +2154,7 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2214,7 +2214,7 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) { define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_9_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2235,7 +2235,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; 
AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2300,7 +2300,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_9_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2321,7 +2321,7 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2381,7 +2381,7 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) { define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_10_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2402,7 +2402,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2467,7 +2467,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_10_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2488,7 +2488,7 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2548,7 +2548,7 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) { define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_11_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2569,7 +2569,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2634,7 +2634,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_11_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2655,7 +2655,7 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2715,7 +2715,7 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) { define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_12_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2736,7 +2736,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2801,7 +2801,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_12_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2822,7 +2822,7 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2882,7 +2882,7 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) { define <16 x i16> 
@ult_13_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_13_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2903,7 +2903,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2968,7 +2968,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_13_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2989,7 +2989,7 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3049,7 +3049,7 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) { define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_14_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3070,7 +3070,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3135,7 +3135,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_14_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3156,7 +3156,7 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} 
xmm1 = [14,14,14,14,14,14,14,14] ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3216,7 +3216,7 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) { define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_15_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3237,7 +3237,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3432,7 +3432,7 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3458,7 +3458,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3535,7 +3535,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3561,7 +3561,7 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3638,7 +3638,7 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3664,7 +3664,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> 
%0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3741,7 +3741,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3767,7 +3767,7 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3844,7 +3844,7 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3870,7 +3870,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -3947,7 +3947,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_5_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3973,7 +3973,7 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4050,7 +4050,7 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_5_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4076,7 +4076,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4153,7 +4153,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_6_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4179,7 +4179,7 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4256,7 +4256,7 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_6_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4282,7 +4282,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4359,7 +4359,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_7_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4385,7 +4385,7 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4462,7 +4462,7 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX1-LABEL: 
ugt_7_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4488,7 +4488,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4565,7 +4565,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_8_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4591,7 +4591,7 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4668,7 +4668,7 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_8_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4694,7 +4694,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4771,7 +4771,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_9_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4797,7 +4797,7 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; 
AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4874,7 +4874,7 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_9_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4900,7 +4900,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4977,7 +4977,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_10_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5003,7 +5003,7 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5080,7 +5080,7 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_10_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5106,7 +5106,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5183,7 +5183,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_11_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5209,7 +5209,7 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5286,7 +5286,7 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_11_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5312,7 +5312,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5389,7 +5389,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_12_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5415,7 +5415,7 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5492,7 +5492,7 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_12_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5518,7 +5518,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5595,7 +5595,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_13_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5621,7 +5621,7 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5698,7 +5698,7 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_13_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5724,7 +5724,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5801,7 +5801,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_14_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5827,7 +5827,7 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5904,7 +5904,7 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_14_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5930,7 +5930,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6007,7 +6007,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { define <8 x i32> 
@ult_15_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_15_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6033,7 +6033,7 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6110,7 +6110,7 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_15_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6136,7 +6136,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6213,7 +6213,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_16_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6239,7 +6239,7 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6316,7 +6316,7 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_16_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6342,7 +6342,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16] +; AVX1-NEXT: 
vbroadcastss {{.*#+}} xmm1 = [16,16,16,16] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6419,7 +6419,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_17_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6445,7 +6445,7 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6522,7 +6522,7 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_17_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6548,7 +6548,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6625,7 +6625,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_18_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6651,7 +6651,7 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6728,7 +6728,7 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_18_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6754,7 +6754,7 @@ 
define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6831,7 +6831,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_19_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6857,7 +6857,7 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6934,7 +6934,7 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_19_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6960,7 +6960,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7037,7 +7037,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_20_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7063,7 +7063,7 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7140,7 +7140,7 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_20_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7166,7 +7166,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7243,7 +7243,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_21_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7269,7 +7269,7 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7346,7 +7346,7 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_21_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7372,7 +7372,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7449,7 +7449,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_22_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7475,7 +7475,7 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 @@ -7552,7 +7552,7 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_22_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7578,7 +7578,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7655,7 +7655,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_23_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7681,7 +7681,7 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7758,7 +7758,7 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_23_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7784,7 +7784,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7861,7 +7861,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_24_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7887,7 +7887,7 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: 
vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -7964,7 +7964,7 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_24_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7990,7 +7990,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8067,7 +8067,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_25_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8093,7 +8093,7 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8170,7 +8170,7 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_25_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8196,7 +8196,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8273,7 +8273,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_26_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8299,7 +8299,7 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8376,7 +8376,7 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_26_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8402,7 +8402,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8479,7 +8479,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_27_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8505,7 +8505,7 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8582,7 +8582,7 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_27_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8608,7 +8608,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8685,7 +8685,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_28_v8i32: ; AVX1: # %bb.0: 
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8711,7 +8711,7 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8788,7 +8788,7 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_28_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8814,7 +8814,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8891,7 +8891,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_29_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8917,7 +8917,7 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -8994,7 +8994,7 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_29_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9020,7 +9020,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9097,7 +9097,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_30_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9123,7 +9123,7 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9200,7 +9200,7 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_30_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9226,7 +9226,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30] ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9303,7 +9303,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_31_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9329,7 +9329,7 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31] ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9536,7 +9536,7 @@ define <4 x i64> @ult_2_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_2_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9554,7 +9554,8 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9619,7 +9620,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_3_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9637,7 +9638,8 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9702,7 +9704,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_3_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9720,7 +9722,8 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9785,7 +9788,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_4_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9803,7 +9806,8 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9868,7 +9872,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_4_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9886,7 +9890,8 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9951,7 +9956,7 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_5_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9969,7 +9974,8 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10034,7 +10040,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_5_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10052,7 +10058,8 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10117,7 +10124,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_6_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10135,7 +10142,8 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10200,7 +10208,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_6_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10218,7 +10226,8 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10283,7 +10292,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_7_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10301,7 +10310,8 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10366,7 +10376,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_7_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10384,7 +10394,8 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10449,7 +10460,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_8_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10467,7 +10478,8 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 
@@ -10532,7 +10544,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_8_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10550,7 +10562,8 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10615,7 +10628,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_9_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10633,7 +10646,8 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10698,7 +10712,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_9_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10716,7 +10730,8 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10781,7 +10796,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_10_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10799,7 +10814,8 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10864,7 +10880,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_10_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10882,7 +10898,8 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10947,7 +10964,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_11_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10965,7 +10982,8 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11030,7 +11048,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_11_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11048,7 +11066,8 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11113,7 +11132,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_12_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11131,7 
+11150,8 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11196,7 +11216,7 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_12_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11214,7 +11234,8 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11279,7 +11300,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_13_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11297,7 +11318,8 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11362,7 +11384,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_13_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11380,7 +11402,8 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11445,7 +11468,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_14_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11463,7 +11486,8 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11528,7 +11552,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_14_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11546,7 +11570,8 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11611,7 +11636,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_15_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11629,7 +11654,8 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11694,7 +11720,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_15_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11712,7 +11738,8 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11777,7 +11804,7 @@ define <4 x i64> 
@ugt_15_v4i64(<4 x i64> %0) { define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_16_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11795,7 +11822,8 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11860,7 +11888,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_16_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11878,7 +11906,8 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11943,7 +11972,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_17_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11961,7 +11990,8 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12026,7 +12056,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_17_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12044,7 +12074,8 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup 
{{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12109,7 +12140,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_18_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12127,7 +12158,8 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12192,7 +12224,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_18_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12210,7 +12242,8 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12275,7 +12308,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_19_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12293,7 +12326,8 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12358,7 +12392,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_19_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12376,7 +12410,8 @@ define 
<4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12441,7 +12476,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_20_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12459,7 +12494,8 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12524,7 +12560,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_20_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12542,7 +12578,8 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12607,7 +12644,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_21_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12625,7 +12662,8 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12690,7 +12728,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_21_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12708,7 +12746,8 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12773,7 +12812,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_22_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12791,7 +12830,8 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12856,7 +12896,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_22_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12874,7 +12914,8 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12939,7 +12980,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_23_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12957,7 +12998,8 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13022,7 +13064,7 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> 
%0) { define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_23_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13040,7 +13082,8 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13105,7 +13148,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_24_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13123,7 +13166,8 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13188,7 +13232,7 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_24_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13206,7 +13250,8 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13271,7 +13316,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_25_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13289,7 +13334,8 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13354,7 +13400,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_25_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13372,7 +13418,8 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13437,7 +13484,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_26_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13455,7 +13502,8 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13520,7 +13568,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_26_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13538,7 +13586,8 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13603,7 +13652,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_27_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13621,7 +13670,8 @@ define <4 x i64> 
@ult_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13686,7 +13736,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_27_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13704,7 +13754,8 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13769,7 +13820,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_28_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13787,7 +13838,8 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13852,7 +13904,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_28_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13870,7 +13922,8 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13935,7 +13988,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_29_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13953,7 +14006,8 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14018,7 +14072,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_29_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14036,7 +14090,8 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14101,7 +14156,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_30_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14119,7 +14174,8 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14184,7 +14240,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_30_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14202,7 +14258,8 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14267,7 +14324,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> 
%0) { define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_31_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14285,7 +14342,8 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14350,7 +14408,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_31_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14368,7 +14426,8 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14433,7 +14492,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_32_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14451,7 +14510,8 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14516,7 +14576,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_32_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14534,7 +14594,8 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14599,7 +14660,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_33_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14617,7 +14678,8 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14682,7 +14744,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_33_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14700,7 +14762,8 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14765,7 +14828,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_34_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14783,7 +14846,8 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14848,7 +14912,7 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_34_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14866,7 +14930,8 @@ define <4 x i64> 
@ugt_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14931,7 +14996,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_35_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14949,7 +15014,8 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15014,7 +15080,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_35_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15032,7 +15098,8 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15097,7 +15164,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_36_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15115,7 +15182,8 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15180,7 +15248,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_36_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15198,7 +15266,8 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15263,7 +15332,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_37_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15281,7 +15350,8 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15346,7 +15416,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_37_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15364,7 +15434,8 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15429,7 +15500,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_38_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15447,7 +15518,8 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15512,7 +15584,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> 
%0) { define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_38_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15530,7 +15602,8 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15595,7 +15668,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_39_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15613,7 +15686,8 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15678,7 +15752,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_39_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15696,7 +15770,8 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15761,7 +15836,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_40_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15779,7 +15854,8 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15844,7 +15920,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_40_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15862,7 +15938,8 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15927,7 +16004,7 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_41_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15945,7 +16022,8 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16010,7 +16088,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_41_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16028,7 +16106,8 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16093,7 +16172,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_42_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16111,7 +16190,8 @@ define <4 x i64> 
@ult_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16176,7 +16256,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_42_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16194,7 +16274,8 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16259,7 +16340,7 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_43_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16277,7 +16358,8 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16342,7 +16424,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_43_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16360,7 +16442,8 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16425,7 +16508,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_44_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16443,7 +16526,8 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16508,7 +16592,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_44_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16526,7 +16610,8 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16591,7 +16676,7 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_45_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16609,7 +16694,8 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16674,7 +16760,7 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_45_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16692,7 +16778,8 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16757,7 +16844,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> 
%0) { define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_46_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16775,7 +16862,8 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16840,7 +16928,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_46_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16858,7 +16946,8 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16923,7 +17012,7 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_47_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16941,7 +17030,8 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17006,7 +17096,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_47_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17024,7 +17114,8 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17089,7 +17180,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_48_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17107,7 +17198,8 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17172,7 +17264,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_48_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17190,7 +17282,8 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17255,7 +17348,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_49_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17273,7 +17366,8 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17338,7 +17432,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_49_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17356,7 +17450,8 @@ define <4 x i64> 
@ugt_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17421,7 +17516,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_50_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17439,7 +17534,8 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17504,7 +17600,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_50_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17522,7 +17618,8 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17587,7 +17684,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_51_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17605,7 +17702,8 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17670,7 +17768,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_51_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17688,7 +17786,8 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17753,7 +17852,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_52_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17771,7 +17870,8 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17836,7 +17936,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_52_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17854,7 +17954,8 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17919,7 +18020,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_53_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17937,7 +18038,8 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18002,7 +18104,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> 
%0) { define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_53_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18020,7 +18122,8 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18085,7 +18188,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_54_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18103,7 +18206,8 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18168,7 +18272,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_54_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18186,7 +18290,8 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18251,7 +18356,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_55_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18269,7 +18374,8 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18334,7 +18440,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_55_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18352,7 +18458,8 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18417,7 +18524,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_56_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18435,7 +18542,8 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18500,7 +18608,7 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_56_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18518,7 +18626,8 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18583,7 +18692,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_57_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18601,7 +18710,8 @@ define <4 x i64> 
@ult_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18666,7 +18776,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_57_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18684,7 +18794,8 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18749,7 +18860,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_58_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18767,7 +18878,8 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18832,7 +18944,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_58_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18850,7 +18962,8 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18915,7 +19028,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_59_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18933,7 +19046,8 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18998,7 +19112,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_59_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19016,7 +19130,8 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19081,7 +19196,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_60_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19099,7 +19214,8 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19164,7 +19280,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_60_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19182,7 +19298,8 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19247,7 +19364,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> 
%0) { define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_61_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19265,7 +19382,8 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19330,7 +19448,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_61_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19348,7 +19466,8 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19413,7 +19532,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_62_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19431,7 +19550,8 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19496,7 +19616,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_62_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19514,7 +19634,8 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] 
+; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19579,7 +19700,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_63_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19597,7 +19718,8 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index 568c121..e4da7db 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -11,7 +11,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -49,11 +49,11 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; XOP-LABEL: testv4i64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -102,7 +102,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -152,11 +152,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; XOP-LABEL: testv8i32: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -220,7 +220,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -262,11 +262,11 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; XOP-LABEL: testv16i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOP-NEXT: vpand %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; XOP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOP-NEXT: vpshlb %xmm4, %xmm0, %xmm5 ; XOP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; XOP-NEXT: vpaddb %xmm2, %xmm5, %xmm2 @@ -318,7 +318,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -351,11 +351,11 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; XOP-LABEL: testv32i8: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 92c2ebc..df4c348 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -66,7 +66,8 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX1-LABEL: test_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # 
xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -191,7 +192,8 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -396,7 +398,8 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -748,7 +751,8 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 5cade4e..61a7a23 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -66,7 +66,8 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX1-LABEL: test_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -191,7 +192,8 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 @@ -400,7 +402,8 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 @@ -751,7 +754,8 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll 
b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 1f35d33..33f7a4e 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -58,7 +58,8 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: var_rotate_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -281,7 +282,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -672,7 +673,8 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: splatvar_rotate_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 7cfdc5d..6c854b7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -17,7 +17,8 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: var_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 @@ -93,10 +94,10 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: var_rotate_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] @@ -177,11 +178,11 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: var_rotate_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: 
vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -317,7 +318,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 @@ -326,7 +327,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 @@ -334,7 +335,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8 ; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 @@ -517,7 +518,8 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 @@ -1376,7 +1378,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 @@ -1637,7 +1639,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 8c03a1a..938fba0 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -52,7 +52,8 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: var_shift_v2i64: 
; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2 @@ -634,7 +635,8 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX1-LABEL: splatvar_shift_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -831,7 +833,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -945,7 +947,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi ; AVX1-LABEL: splatvar_modulo_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -1134,7 +1137,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1731,7 +1734,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1801,7 +1804,8 @@ define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) { ; AVX1-LABEL: PR52719: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index a7d4e88..5a70e5d 100644 --- 
a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -20,7 +20,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: var_shift_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 @@ -654,7 +655,8 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 @@ -709,7 +711,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; X86-AVX1-LABEL: splatvar_shift_v4i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm2 = mem[0,0] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 @@ -868,7 +871,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 @@ -970,7 +973,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 @@ -1008,7 +1011,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi ; AVX1-LABEL: splatvar_modulo_shift_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 @@ -1069,7 +1073,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi ; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-AVX1-NEXT: 
vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm2 = mem[0,0] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 @@ -1232,7 +1237,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 @@ -1338,7 +1343,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 @@ -1785,7 +1790,8 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] +; XOPAVX1-NEXT: # xmm2 = mem[0,0] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1943,9 +1949,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 @@ -1967,7 +1973,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] ; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2011,9 +2017,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; 
X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 @@ -2062,7 +2068,8 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: shift32_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] +; XOPAVX1-NEXT: # xmm2 = mem[0,0] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2113,7 +2120,8 @@ define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) { ; AVX1-LABEL: PR52719: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index 1fbdc3b..dfba0d9 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1341,7 +1341,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1479,7 +1479,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1617,7 +1617,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -2312,7 +2312,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -2382,7 +2382,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -2452,7 +2452,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index eed4637..77f5f26 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -348,18 +348,18 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 @@ -470,18 +470,18 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = 
[63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; X86-AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; X86-AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 @@ -1643,7 +1643,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -1659,7 +1659,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1687,7 +1687,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 617a2d9..deb1514 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -194,7 +194,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -1121,7 +1121,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index dd63565..c355eea 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -89,7 +89,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 @@ -134,7 +134,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 @@ -160,7 +160,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -249,7 +249,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7] ; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] ; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 ; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero @@ -294,13 +294,13 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 @@ -404,13 +404,13 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX1: # %bb.0: ; 
X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 @@ -1240,7 +1240,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] @@ -1331,7 +1331,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1] ; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; X86-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] @@ -1531,7 +1531,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -1547,7 +1547,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1575,7 +1575,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index e7600d2..4d4642b 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -130,7 +130,7 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -253,7 +253,7 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 95320ec..57a3c95 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -615,7 +615,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; ; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -634,7 +634,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; ; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -673,7 +673,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; ; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -692,7 +692,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; ; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -745,7 +745,8 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; ; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -764,7 +765,8 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; ; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX1-NEXT: # xmm2 = mem[0,0] ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -956,7 +958,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30( ; ; AVX1-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1012,7 +1014,8 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1077,7 +1080,8 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; ; AVX1-LABEL: load_fold_pblendvb: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -1096,7 +1100,8 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; ; XOPAVX1-LABEL: load_fold_pblendvb: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX1-NEXT: # xmm1 = mem[0,0] ; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -1139,7 +1144,8 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; ; AVX1-LABEL: load_fold_pblendvb_commute: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -1160,7 +1166,8 @@ define <16 
x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; ; XOPAVX1-LABEL: load_fold_pblendvb_commute: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX1-NEXT: # xmm1 = mem[0,0] ; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2185,7 +2192,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; ; AVX1-LABEL: PR12412: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 8cfeb2a..f73081c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6094,7 +6094,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u ; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,1,4,5,8,9,4,5,0,1,4,5,8,9,4,5] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index aebcb68..58401a2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1670,7 +1670,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_ ; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1684,7 +1684,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_ ; XOPAVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2475,7 +2475,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] ; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpperm %xmm4, %xmm1, %xmm0, 
%xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2550,10 +2550,12 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_ ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 @@ -4809,7 +4811,7 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_ ; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 @@ -4864,7 +4866,7 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_ ; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -5063,7 +5065,7 @@ define <32 x i8> @PR55066(<32 x i8> %a0) { ; AVX1-LABEL: PR55066: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index c6006a9..abd9fd7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1753,7 +1753,7 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -1847,7 +1847,7 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -3319,7 +3319,8 @@ define void @PR45604(ptr %dst, ptr %src) { ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 6c57956..3f935b2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -229,7 +229,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -311,7 +312,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -364,7 +365,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 @@ -1009,7 +1010,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1091,7 +1093,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -1144,7 +1146,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: 
vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 @@ -1859,7 +1861,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1985,7 +1988,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -2038,7 +2041,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 @@ -2325,7 +2328,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 @@ -2445,7 +2449,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 @@ -2497,7 +2501,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 6ecc398..804fd89 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ 
b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -109,7 +109,8 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -270,7 +271,8 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -501,7 +503,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -922,7 +925,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4294967295,4294967295] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -1127,7 +1131,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1304,7 +1309,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1562,7 +1568,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -1816,7 +1823,8 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -2240,7 +2248,8 @@ define 
<8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [65535,65535] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2827,7 +2836,8 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2992,7 +3002,8 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -3237,7 +3248,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; AVX1-LABEL: trunc_packus_v4i64_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 @@ -3247,7 +3259,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3495,7 +3507,8 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 @@ -3505,7 +3518,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3909,7 +3922,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = 
[255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -4311,7 +4325,8 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -5009,7 +5024,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; AVX1-LABEL: trunc_packus_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index c378281..baed531 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -113,10 +113,12 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -268,10 +270,12 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -501,13 +505,15 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, 
%xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -932,7 +938,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -941,7 +948,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -1137,10 +1145,12 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1306,10 +1316,12 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1556,13 +1568,15 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -1802,13 +1816,15 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: 
trunc_ssat_v4i64_v4i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -2212,7 +2228,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32767,32767] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2221,7 +2238,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -2564,10 +2582,12 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2722,10 +2742,12 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2968,17 +2990,19 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; AVX1-LABEL: trunc_ssat_v4i64_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: # 
xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3227,17 +3251,19 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX1-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3643,7 +3669,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -3652,7 +3679,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -4057,7 +4085,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: 
vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -4066,7 +4095,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -4773,7 +4803,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-LABEL: trunc_ssat_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 @@ -4797,7 +4828,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 @@ -5967,13 +5999,13 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX1-LABEL: trunc_ssat_v16i32_v16i24: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8388607,8388607,8388607,8388607] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8388607,8388607,8388607,8388607] ; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4286578688,4286578688,4286578688,4286578688] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4286578688,4286578688,4286578688,4286578688] ; AVX1-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a830a96..1078512 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -72,7 +72,8 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -181,7 +182,8 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] ; AVX1-NEXT: # xmm1 = mem[0,0] 
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -323,9 +325,11 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_usat_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm1, %xmm4, %xmm1 @@ -605,9 +609,11 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -759,7 +765,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -882,7 +889,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1051,9 +1059,11 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; AVX1-LABEL: trunc_usat_v4i64_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm4 
= mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535] ; AVX1-NEXT: # xmm5 = mem[0,0] @@ -1218,9 +1228,11 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX1-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535] ; AVX1-NEXT: # xmm5 = mem[0,0] @@ -1479,9 +1491,11 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -1769,7 +1783,7 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; AVX1-LABEL: trunc_usat_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -1935,7 +1949,7 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; ; AVX1-LABEL: trunc_usat_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [65535,65535,65535,65535] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -2033,7 +2047,8 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2145,7 +2160,8 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] ; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; 
AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2299,9 +2315,11 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] ; AVX1-NEXT: # xmm4 = mem[0,0] @@ -2310,7 +2328,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2470,9 +2488,11 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] ; AVX1-NEXT: # xmm4 = mem[0,0] @@ -2481,7 +2501,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2723,9 +2743,11 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -2965,9 +2987,11 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -3357,9 +3381,11 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: # xmm7 = mem[0,0] @@ -3693,7 +3719,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; AVX1-LABEL: trunc_usat_v8i32_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255] ; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -3806,7 +3832,7 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; AVX1-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255] ; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -3964,7 +3990,7 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; ; AVX1-LABEL: trunc_usat_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -4107,7 +4133,7 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) { ; ; AVX1-LABEL: trunc_usat_v16i32_v16i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -4323,7 +4349,7 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { ; AVX1-LABEL: trunc_usat_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -4442,7 +4468,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; AVX1-LABEL: trunc_usat_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminuw (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 @@ -4688,7 +4714,7 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; ; AVX1-LABEL: trunc_usat_v32i32_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 8c85c82..6f2e05b 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -571,7 +571,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX1-LABEL: trunc8i32_8i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1817,7 +1817,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; ; AVX1-LABEL: trunc2x8i16_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 0a19c16..3d5947d 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -105,7 +105,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -306,7 +306,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -527,7 +527,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -768,7 +768,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, 
%xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -997,7 +997,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1227,7 +1227,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1443,7 +1443,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1643,7 +1643,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index f2c8471..cf3803a 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -18,7 +18,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -143,7 +143,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -268,7 +268,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -417,7 +417,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -565,7 +565,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -713,7 +713,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -862,7 +862,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1004,7 +1004,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll 
b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 650ee0e..18bd9e7 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -356,7 +356,7 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX1-LABEL: ugt_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -393,7 +393,7 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX1-LABEL: ult_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -430,7 +430,7 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX1-LABEL: uge_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -469,7 +469,7 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX1-LABEL: ule_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index d180bbe..c3e9a2b 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -451,7 +451,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl @@ -478,7 +478,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: retq @@ -509,7 +509,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; 
X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl @@ -536,7 +536,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: retq @@ -567,7 +567,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl @@ -594,7 +594,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: retq @@ -625,7 +625,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl @@ -652,7 +652,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 6d42147..6ba2057 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -157,11 +157,11 @@ define <32 x i8> @PR22706(<32 x i1> %x) { ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = 
[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll index 3a813cc..cb0542c 100644 --- a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -5064,7 +5064,8 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test125: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -5211,7 +5212,8 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test126: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -5357,7 +5359,8 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test127: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -5503,7 +5506,8 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test128: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -7481,7 +7485,8 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test156: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -7628,7 +7633,8 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test159: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -7775,7 +7781,8 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) { ; AVX1-LABEL: test160: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 
= [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 @@ -8204,7 +8211,8 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test165: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -8301,7 +8309,8 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test166: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -8397,7 +8406,8 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test167: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -8493,7 +8503,8 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test168: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -8915,7 +8926,8 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test173: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -9011,7 +9023,8 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test174: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 @@ -9108,7 +9121,8 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: test175: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; 
AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9205,7 +9219,8 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: test176:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9512,7 +9527,8 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test181:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9580,7 +9596,8 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test182:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9648,7 +9665,8 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test183:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9716,7 +9734,8 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test184:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10018,7 +10037,8 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test189:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10086,7 +10106,8 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test190:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10154,7 +10175,8 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test191:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10222,7 +10244,8 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX1-LABEL: test192:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index 7a9b66d..ffc929c 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -531,7 +531,8 @@ define <4 x i64> @blend_splat1_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; XOP-NEXT: vpsllq $63, %xmm3, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm4 = mem[0,0]
 ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3
 ; XOP-NEXT: vpsllq $63, %xmm0, %xmm0
 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0
@@ -860,7 +861,8 @@ define <4 x i64> @blend_splat_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
 ; XOP-NEXT: vpsllq $62, %xmm3, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm4 = mem[0,0]
 ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3
 ; XOP-NEXT: vpsllq $62, %xmm0, %xmm0
 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 2fd2afd..799c11d 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -431,7 +431,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
 define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -439,35 +439,35 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -558,7 +558,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
 define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -566,7 +566,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
 ; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
@@ -582,11 +582,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -600,11 +600,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
 ; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
 ; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -616,11 +616,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 =
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
 ; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index c93d4a6..b5f9c26 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1075,7 +1075,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -2410,7 +2411,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
@@ -2705,7 +2706,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
@@ -3001,7 +3002,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: # xmm3 = mem[0,0]
 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 0ea821b..bbd6416 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -894,7 +894,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1892,7 +1893,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
@@ -2145,7 +2146,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
@@ -2398,7 +2399,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: # xmm2 = mem[0,0]
 ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-- 
2.7.4