Now that D59484 has landed its easier to add these.
Added missing AVX512BW v32i16 equivalents while I was at it.
llvm-svn: 357155
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ128m addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
}
+let Predicates = [HasBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+ def : Pat<(v32i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZm addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWrm addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
}
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movzwl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movzwl (%rdi), %eax
-; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX2OR512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movzwl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movzwl (%rdi), %eax
-; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0
+; AVX2OR512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
;
; SKX-LABEL: insert_dup_mem_v32i16_i32:
; SKX: ## %bb.0:
-; SKX-NEXT: movl (%rdi), %eax
-; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: vpbroadcastw (%rdi), %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
; KNL: ## %bb.0:
-; KNL-NEXT: movzwl (%rdi), %eax
-; KNL-NEXT: vmovd %eax, %xmm0
-; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
; SKX: ## %bb.0:
-; SKX-NEXT: movzwl (%rdi), %eax
-; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: vpbroadcastw (%rdi), %zmm0
; SKX-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
;
; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
; SKX: ## %bb.0:
-; SKX-NEXT: movzwl 2(%rdi), %eax
-; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
;
; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
; SKX: ## %bb.0:
-; SKX-NEXT: movzwl 2(%rdi), %eax
-; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1