From 8a84c747d2de2e99e035d8e072a00795b406ca6e Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sat, 25 Aug 2018 14:56:05 +0000
Subject: [PATCH] [x86] try harder to use broadcast to load a scalar into vector reg

This is a preliminary step for a preliminary step for D50992.

I noticed that x86 often misses chances to load a scalar directly into a
vector register. So this patch is just allowing more of those cases to
match a broadcast op in lowerBuildVectorAsBroadcast(). The old code
comment said it doesn't make sense to use a broadcast when we're loading
a single element and everything else is undef, but I think that's the
best case in the improved tests in insert-loaded-scalar.ll. We avoid a
scalar-to-vector-register move and/or less efficient shuffling.

Note that there are some existing types that were already producing a
broadcast, but that happens semi-accidentally. I.e., it's not happening
as part of lowerBuildVectorAsBroadcast(). The build vector gets expanded
into load + shuffle, and then shuffle lowering produces the broadcast.

Description of the other test diffs:
1. avx-basic.ll - replacing load+shuffle is a win.
2. sse3-avx-addsub-2.ll - vmovddup vs. vbroadcastss is neutral
3. sse41.ll - don't care - we convert that intrinsic to generic IR now,
   so this test is deprecated
4. vector-shuffle-128-v8.ll / vector-shuffle-256-v16.ll - pshufb
   alternatives with an extra instruction are not obviously bad

Differential Revision: https://reviews.llvm.org/D51125

llvm-svn: 340685
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         |  18 ++-
 llvm/test/CodeGen/X86/avx-basic.ll              |   3 +-
 llvm/test/CodeGen/X86/insert-loaded-scalar.ll   | 163 +++++++++++-------------
 llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll      |  22 +++-
 llvm/test/CodeGen/X86/sse41.ll                  |  10 +-
 llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll  |   5 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll |   5 +-
 7 files changed, 115 insertions(+), 111 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3fa4e9f..111096b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7122,9 +7122,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     }
   }
 
-  // We need a splat of a single value to use broadcast, and it doesn't
-  // make any sense if the value is only in one element of the vector.
-  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumUndefElts = UndefElements.count();
+  if (!Ld || (NumElts - NumUndefElts) <= 1) {
     APInt SplatValue, Undef;
     unsigned SplatBitSize;
     bool HasUndef;
@@ -7200,7 +7200,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
         }
       }
     }
-    return SDValue();
+
+    // If we are moving a scalar into a vector (Ld must be set and all elements
+    // but 1 are undef) and that operation is not obviously supported by
+    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
+    // That's better than general shuffling and may eliminate a load to GPR and
+    // move from scalar to vector register.
+    if (!Ld || NumElts - NumUndefElts != 1)
+      return SDValue();
+    unsigned ScalarSize = Ld.getValueSizeInBits();
+    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
+      return SDValue();
   }
 
   bool ConstSplatVal =
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index d27a641..b7c9b69 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -76,8 +76,7 @@ define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
 define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
 ; CHECK-LABEL: VMOVZQI2PQI:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0
 ; CHECK-NEXT:    retq
 %ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
 %val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
diff --git a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
index ec6b99c..81cb533 100644
--- a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
+++ b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
@@ -10,11 +10,16 @@ define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_elt0_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_elt0_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_elt0_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    retq
 %x = load i8, i8* %p
 %ins = insertelement <16 x i8> undef, i8 %x, i32 0
 ret <16 x i8> %ins
@@ -27,11 +32,16 @@ define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load16_ins_elt0_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load16_ins_elt0_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load16_ins_elt0_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
+; AVX2-NEXT:    retq
 %x = load i16, i16* %p
 %ins = insertelement <8 x i16> undef, i16 %x, i32 0
 ret <8 x i16> %ins
@@ -105,12 +115,17 @@ define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {
 ; SSE-NEXT:    pslld $24, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_eltc_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_eltc_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_eltc_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    retq
 %x = load i8, i8* %p
 %ins = insertelement <16 x i8> undef, i8 %x, i32 3
 ret <16 x i8> %ins
@@ -147,17 +162,10 @@ define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 %x = load i32, i32* %p
 %ins = insertelement <4 x i32> undef, i32 %x, i32 2
 ret <4 x i32> %ins
@@ -223,11 +231,16 @@ define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_elt0_v32i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_elt0_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_elt0_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
+; AVX2-NEXT:    retq
 %x = load i8, i8* %p
 %ins = insertelement <32 x i8> undef, i8 %x, i32 0
 ret <32 x i8> %ins
@@ -240,11 +253,16 @@ define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load16_ins_elt0_v16i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load16_ins_elt0_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load16_ins_elt0_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX2-NEXT:    retq
 %x = load i16, i16* %p
 %ins = insertelement <16 x i16> undef, i16 %x, i32 0
 ret <16 x i16> %ins
@@ -328,10 +346,7 @@ define <32 x i8> @load8_ins_eltc_v32i8(i8* %p) nounwind {
 ;
 ; AVX2-LABEL: load8_ins_eltc_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 %x = load i8, i8* %p
 %ins = insertelement <32 x i8> undef, i8 %x, i32 21
@@ -356,10 +371,7 @@ define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {
 ;
 ; AVX2-LABEL: load16_ins_eltc_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 %x = load i16, i16* %p
 %ins = insertelement <16 x i16> undef, i16 %x, i32 11
@@ -373,18 +385,10 @@ define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
 %x = load i32, i32* %p
 %ins = insertelement <8 x i32> undef, i32 %x, i32 7
 ret <8 x i32> %ins
@@ -397,17 +401,10 @@ define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load64_ins_eltc_v4i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 %x = load i64, i64* %p
 %ins = insertelement <4 x i64> undef, i64 %x, i32 3
 ret <4 x i64> %ins
@@ -420,18 +417,10 @@ define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {
 ; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
 %x = load float, float* %p
 %ins = insertelement <8 x float> undef, float %x, i32 5
 ret <8 x float> %ins
@@ -443,16 +432,10 @@ define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {
 ; SSE-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load64_ins_eltc_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 %x = load double, double* %p
 %ins = insertelement <4 x double> undef, double %x, i32 3
 ret <4 x double> %ins
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 9f9fe23..d563180 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -274,13 +274,21 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
 ; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test11:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test11:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test11:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
+; AVX512-NEXT:    retq
 %1 = extractelement <4 x float> %A, i32 2
 %2 = extractelement <4 x float> %B, i32 2
 %sub = fsub float %1, %2
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index c466ee7..b5c3726 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -97,8 +97,9 @@ define <2 x i64> @pmovzxbq_1() nounwind {
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
 ; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
-; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X86-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX512-NEXT:    vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X86-AVX512-NEXT:    vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: pmovzxbq_1:
@@ -121,8 +122,9 @@ define <2 x i64> @pmovzxbq_1() nounwind {
 ; X64-AVX512:       ## %bb.0: ## %entry
 ; X64-AVX512-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
 ; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X64-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX512-NEXT:    vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X64-AVX512-NEXT:    vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
 entry:
 %0 = load i16, i16* @g16, align 2 ; [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 367a72f..c59db9b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2619,8 +2619,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
 ;
 ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 01998a5..3566ebd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4597,8 +4597,9 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
 define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
 ; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-- 
2.7.4
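
Illustrative note (not part of the patch): a minimal standalone version of the pattern this change targets, copied from the insert-loaded-scalar.ll cases above. Inserting a loaded scalar into one lane of an otherwise-undef vector now matches lowerBuildVectorAsBroadcast(), so the whole build vector becomes a single load-folded broadcast.

define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 7
  ret <8 x i32> %ins
}

Per the updated AVX checks above, this now compiles to a single "vbroadcastss (%rdi), %ymm0" followed by retq, replacing the previous xmm-sized broadcast (or scalar load plus shuffle) and vinsertf128 sequence.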