From: Simon Pilgrim Date: Thu, 28 Mar 2019 10:25:13 +0000 (+0000) Subject: [X86][AVX] Add missing vXi16 broadcast fold patterns X-Git-Tag: llvmorg-10-init~9001 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=22be913ac00bd220af46f7e1f4f5ea59027c0b11;p=platform%2Fupstream%2Fllvm.git [X86][AVX] Add missing vXi16 broadcast fold patterns Now that D59484 has landed it's easier to add these. Added missing AVX512BW v32i16 equivalents while I was at it. llvm-svn: 357155 --- diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 7148710..f7b9216 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1394,12 +1394,30 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ128m addr:$src)>; def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; } +let Predicates = [HasBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. 
+ def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; +} //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index f0c781c..c3f471e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7955,9 +7955,15 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWrm addr:$src)>; def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 020354b..6cd0aba 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2666,18 +2666,10 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movzwl (%rdi), %eax -; AVX512VL-NEXT: 
vpbroadcastw %eax, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2OR512VL-NEXT: retq %tmp = load i16, i16* %ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 7d2b6c8..becc195 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4729,18 +4729,10 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16: -; AVX2: # %bb.0: -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movzwl (%rdi), %eax -; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2OR512VL-NEXT: retq %tmp = load i16, i16* %ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index d9a0ab5..a67f0c4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -220,8 +220,7 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { ; ; SKX-LABEL: insert_dup_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: movl (%rdi), %eax -; SKX-NEXT: vpbroadcastw %eax, %zmm0 +; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 ; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, 
i32 0 @@ -233,16 +232,13 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: ; KNL: ## %bb.0: -; KNL-NEXT: movzwl (%rdi), %eax -; KNL-NEXT: vmovd %eax, %xmm0 -; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 ; KNL-NEXT: vmovdqa %ymm0, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: ; SKX: ## %bb.0: -; SKX-NEXT: movzwl (%rdi), %eax -; SKX-NEXT: vpbroadcastw %eax, %zmm0 +; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 ; SKX-NEXT: retq %tmp = load i16, i16* %ptr, align 2 %tmp1 = sext i16 %tmp to i32 @@ -261,8 +257,7 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { ; ; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: movzwl 2(%rdi), %eax -; SKX-NEXT: vpbroadcastw %eax, %zmm0 +; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 ; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -280,8 +275,7 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { ; ; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: movzwl 2(%rdi), %eax -; SKX-NEXT: vpbroadcastw %eax, %zmm0 +; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 ; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1