-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
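; The '## kill:' assertions added by this patch are register-liveness
; annotations the AsmPrinter emits as comments: AVX512F (without VL) has no
; 256-bit masked operations, so YMM operands are implicitly widened to ZMM,
; and the resulting sub-/super-register copies appear as kill/def markers.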
;
; AVX512F-LABEL: test5:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
;
; AVX512F-LABEL: test11a:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
;
; AVX512F-LABEL: test11b:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; AVX512F-LABEL: test11c:
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11d:
;
; AVX512F-LABEL: test12:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f32:
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8i32:
declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
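; Plain AVX has no byte-granularity masked load (the vmaskmov family only
; handles dword/qword elements), so the <16 x i8> load below is scalarized:
; each mask byte is extracted with vpextrb, tested, and guards a vpinsrb
; load from the corresponding address.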
define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_load_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB50_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_6: ## %else5
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_8: ## %else8
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_10: ## %else11
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_12: ## %else14
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_14: ## %else17
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_16: ## %else20
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_18
+; AVX-NEXT: ## BB#17: ## %cond.load22
+; AVX-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_18: ## %else23
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_20
+; AVX-NEXT: ## BB#19: ## %cond.load25
+; AVX-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_20: ## %else26
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_22
+; AVX-NEXT: ## BB#21: ## %cond.load28
+; AVX-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_22: ## %else29
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_24
+; AVX-NEXT: ## BB#23: ## %cond.load31
+; AVX-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_24: ## %else32
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_26
+; AVX-NEXT: ## BB#25: ## %cond.load34
+; AVX-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_26: ## %else35
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_28
+; AVX-NEXT: ## BB#27: ## %cond.load37
+; AVX-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_28: ## %else38
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_30
+; AVX-NEXT: ## BB#29: ## %cond.load40
+; AVX-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_30: ## %else41
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_32
+; AVX-NEXT: ## BB#31: ## %cond.load43
+; AVX-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_32: ## %else44
+; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
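+; With AVX512F the mask is first turned into a <16 x i32> compare mask in k1
+; (vpmovsxbd/vpslld/vptestmd); lacking AVX512BW, each bit is moved to a GPR
+; with a kshiftlw/kshiftrw pair and tested, and the final byte blend mask is
+; rebuilt through vpbroadcastd/vpmovdb.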
+; AVX512F-LABEL: test_mask_load_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB50_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_32: ## %else44
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_load_16xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
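; The <32 x i8> case repeats the per-byte expansion at 256-bit width: the
; low 16 lanes insert directly into the low xmm half, while each of the
; high 16 lanes round-trips through vextractf128/vinsertf128 to reach the
; upper half of the ymm accumulator.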
define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_load_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB51_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_64: ## %else92
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
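+; AVX2 uses the same per-byte sequence as AVX1, but merges low-half inserts
+; with the integer-domain vpblendd instead of the float-domain vblendps, and
+; computes the final mask with 256-bit integer ops.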
+; AVX2-LABEL: test_mask_load_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB51_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_64: ## %else92
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
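+; Without AVX512BW there is still no 32-byte mask-register support, so
+; AVX512F falls back to the same AVX2-style per-byte expansion for
+; <32 x i8>.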
+; AVX512F-LABEL: test_mask_load_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM1
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: LBB51_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_64: ## %else92
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_load_32xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
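; For <64 x i1> the mask is passed as 64 individual i8 arguments (%dil,
; %sil, %dl, %cl, %r8b, %r9b, then stack slots), so the expansion tests each
; argument directly; spilling the callee-saved GPRs for scratch registers
; accounts for the push/CFI sequence in the prologue below.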
define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_load_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Ltmp3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: Ltmp4:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: Ltmp5:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: Ltmp6:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: Ltmp7:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: Ltmp8:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: Ltmp9:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: Ltmp10:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: Ltmp11:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: Ltmp12:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: Ltmp13:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: Ltmp14:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: Ltmp15:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rax), %ebp
+; AVX1-NEXT: vmovd %ebp, %xmm9
+; AVX1-NEXT: LBB52_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB52_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB52_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_6: ## %else5
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB52_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_8: ## %else8
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB52_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_10: ## %else11
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB52_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_12: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX1-NEXT: testb $1, %r10b
+; AVX1-NEXT: je LBB52_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_14: ## %else17
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX1-NEXT: testb $1, %r11b
+; AVX1-NEXT: je LBB52_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_16: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX1-NEXT: testb $1, %r14b
+; AVX1-NEXT: je LBB52_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_18: ## %else23
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX1-NEXT: testb $1, %r15b
+; AVX1-NEXT: je LBB52_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_20: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dil
+; AVX1-NEXT: testb $1, %r12b
+; AVX1-NEXT: je LBB52_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_22: ## %else29
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_24: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX1-NEXT: testb $1, %bpl
+; AVX1-NEXT: je LBB52_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_26: ## %else35
+; AVX1-NEXT: testb $1, %bl
+; AVX1-NEXT: je LBB52_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_28: ## %else38
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_30: ## %else41
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_32: ## %else44
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_34: ## %else47
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_36: ## %else50
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_38: ## %else53
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_40: ## %else56
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_42: ## %else59
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_44: ## %else62
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_46: ## %else65
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_48: ## %else68
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_50: ## %else71
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_52: ## %else74
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_54: ## %else77
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_56: ## %else80
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_58: ## %else83
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_60: ## %else86
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_62: ## %else89
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_64: ## %else92
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_66
+; AVX1-NEXT: ## BB#65: ## %cond.load94
+; AVX1-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB52_66: ## %else95
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_68
+; AVX1-NEXT: ## BB#67: ## %cond.load97
+; AVX1-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_68: ## %else98
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_70
+; AVX1-NEXT: ## BB#69: ## %cond.load100
+; AVX1-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_70: ## %else101
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_72
+; AVX1-NEXT: ## BB#71: ## %cond.load103
+; AVX1-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_72: ## %else104
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_74
+; AVX1-NEXT: ## BB#73: ## %cond.load106
+; AVX1-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_74: ## %else107
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_76
+; AVX1-NEXT: ## BB#75: ## %cond.load109
+; AVX1-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_76: ## %else110
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_78
+; AVX1-NEXT: ## BB#77: ## %cond.load112
+; AVX1-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_78: ## %else113
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_80
+; AVX1-NEXT: ## BB#79: ## %cond.load115
+; AVX1-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_80: ## %else116
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_82
+; AVX1-NEXT: ## BB#81: ## %cond.load118
+; AVX1-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_82: ## %else119
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_84
+; AVX1-NEXT: ## BB#83: ## %cond.load121
+; AVX1-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_84: ## %else122
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_86
+; AVX1-NEXT: ## BB#85: ## %cond.load124
+; AVX1-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_86: ## %else125
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_88
+; AVX1-NEXT: ## BB#87: ## %cond.load127
+; AVX1-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_88: ## %else128
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_90
+; AVX1-NEXT: ## BB#89: ## %cond.load130
+; AVX1-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_90: ## %else131
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_92
+; AVX1-NEXT: ## BB#91: ## %cond.load133
+; AVX1-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_92: ## %else134
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_94
+; AVX1-NEXT: ## BB#93: ## %cond.load136
+; AVX1-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_94: ## %else137
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_96
+; AVX1-NEXT: ## BB#95: ## %cond.load139
+; AVX1-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_96: ## %else140
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_98
+; AVX1-NEXT: ## BB#97: ## %cond.load142
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_98: ## %else143
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_100
+; AVX1-NEXT: ## BB#99: ## %cond.load145
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_100: ## %else146
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_102
+; AVX1-NEXT: ## BB#101: ## %cond.load148
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_102: ## %else149
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_104
+; AVX1-NEXT: ## BB#103: ## %cond.load151
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_104: ## %else152
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_106
+; AVX1-NEXT: ## BB#105: ## %cond.load154
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_106: ## %else155
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_108
+; AVX1-NEXT: ## BB#107: ## %cond.load157
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_108: ## %else158
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_110
+; AVX1-NEXT: ## BB#109: ## %cond.load160
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_110: ## %else161
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_112
+; AVX1-NEXT: ## BB#111: ## %cond.load163
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_112: ## %else164
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_114
+; AVX1-NEXT: ## BB#113: ## %cond.load166
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_114: ## %else167
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_116
+; AVX1-NEXT: ## BB#115: ## %cond.load169
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_116: ## %else170
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_118
+; AVX1-NEXT: ## BB#117: ## %cond.load172
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_118: ## %else173
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_120
+; AVX1-NEXT: ## BB#119: ## %cond.load175
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_120: ## %else176
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_122
+; AVX1-NEXT: ## BB#121: ## %cond.load178
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_122: ## %else179
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_124
+; AVX1-NEXT: ## BB#123: ## %cond.load181
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_124: ## %else182
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_126
+; AVX1-NEXT: ## BB#125: ## %cond.load184
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_126: ## %else185
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: je LBB52_128
+; AVX1-NEXT: ## BB#127: ## %cond.load187
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $15, 63(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_128: ## %else188
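+; The conditional loads are done; the rest of the AVX1 block rebuilds the
+; <64 x i1> mask as four byte vectors (movzbl + vpinsrb from the GPR and
+; stack arguments), sign-extends each bit via vpsllw $7 / vpand / vpcmpgtb,
+; and selects between the loaded and passthru halves with
+; vandnps/vandps/vorps, since AVX1 has no 256-bit vpblendvb.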
+; AVX1-NEXT: movzbl %r10b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r11b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r14b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r15b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r12b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %dil, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bpl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: movzbl %r13b, %r13d
+; AVX1-NEXT: vmovd %r13d, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX1-NEXT: movzbl %dil, %ebp
+; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm8 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm6 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $3, %r15d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $4, %r14d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $7, %edx, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: vpinsrb $10, %esi, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: vpinsrb $11, %r9d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: vpinsrb $14, %r13d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: vpinsrb $15, %r14d, %xmm6, %xmm10
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: vmovd %edi, %xmm7
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $2, %r15d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $3, %r12d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $4, %r8d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $10, %r13d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $11, %edx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $15, %ebp, %xmm7, %xmm7
+; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm8, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4
+; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vpsllw $7, %xmm10, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm7, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: addq $8, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
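+; The AVX2 expansion follows the same scalarized pattern, but blends the low
+; lane with vpblendd, uses vextracti128/vinserti128 for the high lane, and
+; finishes each half with a single 256-bit vpsllw/vpand/vpblendvb select.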
+; AVX2-LABEL: test_mask_load_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: Ltmp3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: Ltmp4:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: Ltmp5:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: Ltmp6:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: Ltmp7:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: Ltmp8:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: Ltmp9:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: Ltmp10:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: Ltmp11:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: Ltmp12:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: Ltmp13:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: Ltmp14:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: Ltmp15:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB52_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rax), %ebp
+; AVX2-NEXT: vmovd %ebp, %xmm2
+; AVX2-NEXT: LBB52_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB52_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB52_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_6: ## %else5
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB52_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_8: ## %else8
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB52_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_10: ## %else11
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB52_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_12: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX2-NEXT: testb $1, %r10b
+; AVX2-NEXT: je LBB52_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_14: ## %else17
+; AVX2-NEXT: testb $1, %r11b
+; AVX2-NEXT: je LBB52_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_16: ## %else20
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_18: ## %else23
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_20: ## %else26
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_22: ## %else29
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_24: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX2-NEXT: testb $1, %bpl
+; AVX2-NEXT: je LBB52_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_26: ## %else35
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX2-NEXT: testb $1, %bl
+; AVX2-NEXT: je LBB52_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_28: ## %else38
+; AVX2-NEXT: testb $1, %r14b
+; AVX2-NEXT: je LBB52_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_30: ## %else41
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_32: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX2-NEXT: testb $1, %r13b
+; AVX2-NEXT: je LBB52_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_34: ## %else47
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX2-NEXT: testb $1, %r12b
+; AVX2-NEXT: je LBB52_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_36: ## %else50
+; AVX2-NEXT: testb $1, %r15b
+; AVX2-NEXT: je LBB52_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_38: ## %else53
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_40: ## %else56
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_42: ## %else59
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_44: ## %else62
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_46: ## %else65
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_48: ## %else68
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_50: ## %else71
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_52: ## %else74
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_54: ## %else77
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_56: ## %else80
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_58: ## %else83
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_60: ## %else86
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_62: ## %else89
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_64: ## %else92
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_66
+; AVX2-NEXT: ## BB#65: ## %cond.load94
+; AVX2-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB52_66: ## %else95
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_68
+; AVX2-NEXT: ## BB#67: ## %cond.load97
+; AVX2-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_68: ## %else98
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_70
+; AVX2-NEXT: ## BB#69: ## %cond.load100
+; AVX2-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_70: ## %else101
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_72
+; AVX2-NEXT: ## BB#71: ## %cond.load103
+; AVX2-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_72: ## %else104
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_74
+; AVX2-NEXT: ## BB#73: ## %cond.load106
+; AVX2-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_74: ## %else107
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_76
+; AVX2-NEXT: ## BB#75: ## %cond.load109
+; AVX2-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_76: ## %else110
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_78
+; AVX2-NEXT: ## BB#77: ## %cond.load112
+; AVX2-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_78: ## %else113
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_80
+; AVX2-NEXT: ## BB#79: ## %cond.load115
+; AVX2-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_80: ## %else116
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_82
+; AVX2-NEXT: ## BB#81: ## %cond.load118
+; AVX2-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_82: ## %else119
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_84
+; AVX2-NEXT: ## BB#83: ## %cond.load121
+; AVX2-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_84: ## %else122
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_86
+; AVX2-NEXT: ## BB#85: ## %cond.load124
+; AVX2-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_86: ## %else125
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_88
+; AVX2-NEXT: ## BB#87: ## %cond.load127
+; AVX2-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_88: ## %else128
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_90
+; AVX2-NEXT: ## BB#89: ## %cond.load130
+; AVX2-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_90: ## %else131
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_92
+; AVX2-NEXT: ## BB#91: ## %cond.load133
+; AVX2-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_92: ## %else134
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_94
+; AVX2-NEXT: ## BB#93: ## %cond.load136
+; AVX2-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_94: ## %else137
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_96
+; AVX2-NEXT: ## BB#95: ## %cond.load139
+; AVX2-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_96: ## %else140
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_98
+; AVX2-NEXT: ## BB#97: ## %cond.load142
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_98: ## %else143
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_100
+; AVX2-NEXT: ## BB#99: ## %cond.load145
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_100: ## %else146
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_102
+; AVX2-NEXT: ## BB#101: ## %cond.load148
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_102: ## %else149
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_104
+; AVX2-NEXT: ## BB#103: ## %cond.load151
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_104: ## %else152
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_106
+; AVX2-NEXT: ## BB#105: ## %cond.load154
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_106: ## %else155
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_108
+; AVX2-NEXT: ## BB#107: ## %cond.load157
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_108: ## %else158
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_110
+; AVX2-NEXT: ## BB#109: ## %cond.load160
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_110: ## %else161
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_112
+; AVX2-NEXT: ## BB#111: ## %cond.load163
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_112: ## %else164
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_114
+; AVX2-NEXT: ## BB#113: ## %cond.load166
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_114: ## %else167
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_116
+; AVX2-NEXT: ## BB#115: ## %cond.load169
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_116: ## %else170
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_118
+; AVX2-NEXT: ## BB#117: ## %cond.load172
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_118: ## %else173
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_120
+; AVX2-NEXT: ## BB#119: ## %cond.load175
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_120: ## %else176
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_122
+; AVX2-NEXT: ## BB#121: ## %cond.load178
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_122: ## %else179
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_124
+; AVX2-NEXT: ## BB#123: ## %cond.load181
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_124: ## %else182
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: jne LBB52_126
+; AVX2-NEXT: ## BB#125:
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: jmp LBB52_127
+; AVX2-NEXT: LBB52_126: ## %cond.load184
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_127: ## %else185
+; AVX2-NEXT: movl %ebp, %eax
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %esi, %ebp
+; AVX2-NEXT: je LBB52_129
+; AVX2-NEXT: ## BB#128: ## %cond.load187
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $15, 63(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_129: ## %else188
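+; As in the AVX1 version, the mask is now re-expanded: each i1 argument is
+; zero-extended and inserted with vpinsrb, the quarters are joined with
+; vinserti128, and vpsllw $7 / vpand / vpblendvb pick between the loaded
+; bytes (%ymm2/%ymm3) and the passthru value (%ymm0/%ymm1).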
+; AVX2-NEXT: movzbl %r10b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r11b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r14b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r12b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX2-NEXT: movzbl %dil, %r13d
+; AVX2-NEXT: vmovd %r13d, %xmm4
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd %r12d, %xmm6
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $4, %ebx, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: vpinsrb $10, %edx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: vpinsrb $11, %r8d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: vmovd %r12d, %xmm7
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: vpinsrb $1, %r9d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $2, %r11d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $6, %r8d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $9, %ebp, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $12, %r15d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $13, %esi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT: vpsllw $7, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm2
+; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
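+; With AVX512F the mask arrives in vector registers instead of GPRs: each
+; 16-bit quarter is widened with vpmovsxbd / vpslld $31 / vptestmd into a
+; k-register, and single bits are extracted with kshiftlw/kshiftrw pairs
+; (saved around the branches as 2-byte k-register spills). The load is still
+; scalarized because byte-granularity masked ops (vmovdqu8) need AVX512BW.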
+; AVX512F-LABEL: test_mask_load_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: Ltmp0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: Ltmp1:
+; AVX512F-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: Ltmp2:
+; AVX512F-NEXT: .cfi_def_cfa_offset 32
+; AVX512F-NEXT: pushq %r13
+; AVX512F-NEXT: Ltmp3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 40
+; AVX512F-NEXT: pushq %r12
+; AVX512F-NEXT: Ltmp4:
+; AVX512F-NEXT: .cfi_def_cfa_offset 48
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: Ltmp5:
+; AVX512F-NEXT: .cfi_def_cfa_offset 56
+; AVX512F-NEXT: subq $76, %rsp
+; AVX512F-NEXT: Ltmp6:
+; AVX512F-NEXT: .cfi_def_cfa_offset 132
+; AVX512F-NEXT: Ltmp7:
+; AVX512F-NEXT: .cfi_offset %rbx, -56
+; AVX512F-NEXT: Ltmp8:
+; AVX512F-NEXT: .cfi_offset %r12, -48
+; AVX512F-NEXT: Ltmp9:
+; AVX512F-NEXT: .cfi_offset %r13, -40
+; AVX512F-NEXT: Ltmp10:
+; AVX512F-NEXT: .cfi_offset %r14, -32
+; AVX512F-NEXT: Ltmp11:
+; AVX512F-NEXT: .cfi_offset %r15, -24
+; AVX512F-NEXT: Ltmp12:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB52_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, (%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_26: ## %else35
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_28: ## %else38
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_30: ## %else41
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_32: ## %else44
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_34: ## %else47
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_36: ## %else50
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_38: ## %else53
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_40: ## %else56
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_42: ## %else59
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_44: ## %else62
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_46: ## %else65
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_48: ## %else68
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_50: ## %else71
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_52: ## %else74
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_54: ## %else77
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_56: ## %else80
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_58: ## %else83
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_60: ## %else86
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_62: ## %else89
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_64: ## %else92
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_66
+; AVX512F-NEXT: ## BB#65: ## %cond.load94
+; AVX512F-NEXT: vpinsrb $0, 32(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_66: ## %else95
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_68
+; AVX512F-NEXT: ## BB#67: ## %cond.load97
+; AVX512F-NEXT: vpinsrb $1, 33(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_68: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_70
+; AVX512F-NEXT: ## BB#69: ## %cond.load100
+; AVX512F-NEXT: vpinsrb $2, 34(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_70: ## %else101
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_72
+; AVX512F-NEXT: ## BB#71: ## %cond.load103
+; AVX512F-NEXT: vpinsrb $3, 35(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_72: ## %else104
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_74
+; AVX512F-NEXT: ## BB#73: ## %cond.load106
+; AVX512F-NEXT: vpinsrb $4, 36(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_74: ## %else107
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_76
+; AVX512F-NEXT: ## BB#75: ## %cond.load109
+; AVX512F-NEXT: vpinsrb $5, 37(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_76: ## %else110
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_78
+; AVX512F-NEXT: ## BB#77: ## %cond.load112
+; AVX512F-NEXT: vpinsrb $6, 38(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_78: ## %else113
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_80
+; AVX512F-NEXT: ## BB#79: ## %cond.load115
+; AVX512F-NEXT: vpinsrb $7, 39(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_80: ## %else116
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_82
+; AVX512F-NEXT: ## BB#81: ## %cond.load118
+; AVX512F-NEXT: vpinsrb $8, 40(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_82: ## %else119
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_84
+; AVX512F-NEXT: ## BB#83: ## %cond.load121
+; AVX512F-NEXT: vpinsrb $9, 41(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_84: ## %else122
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_86
+; AVX512F-NEXT: ## BB#85: ## %cond.load124
+; AVX512F-NEXT: vpinsrb $10, 42(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_86: ## %else125
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_88
+; AVX512F-NEXT: ## BB#87: ## %cond.load127
+; AVX512F-NEXT: vpinsrb $11, 43(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_88: ## %else128
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_90
+; AVX512F-NEXT: ## BB#89: ## %cond.load130
+; AVX512F-NEXT: vpinsrb $12, 44(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_90: ## %else131
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_92
+; AVX512F-NEXT: ## BB#91: ## %cond.load133
+; AVX512F-NEXT: vpinsrb $13, 45(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_92: ## %else134
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_94
+; AVX512F-NEXT: ## BB#93: ## %cond.load136
+; AVX512F-NEXT: vpinsrb $14, 46(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_94: ## %else137
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_96
+; AVX512F-NEXT: ## BB#95: ## %cond.load139
+; AVX512F-NEXT: vpinsrb $15, 47(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_96: ## %else140
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_98
+; AVX512F-NEXT: ## BB#97: ## %cond.load142
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $0, 48(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_98: ## %else143
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_100
+; AVX512F-NEXT: ## BB#99: ## %cond.load145
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $1, 49(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_100: ## %else146
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_102
+; AVX512F-NEXT: ## BB#101: ## %cond.load148
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $2, 50(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_102: ## %else149
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_104
+; AVX512F-NEXT: ## BB#103: ## %cond.load151
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $3, 51(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_104: ## %else152
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_106
+; AVX512F-NEXT: ## BB#105: ## %cond.load154
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $4, 52(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_106: ## %else155
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_108
+; AVX512F-NEXT: ## BB#107: ## %cond.load157
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $5, 53(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_108: ## %else158
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_110
+; AVX512F-NEXT: ## BB#109: ## %cond.load160
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $6, 54(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_110: ## %else161
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_112
+; AVX512F-NEXT: ## BB#111: ## %cond.load163
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $7, 55(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_112: ## %else164
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_114
+; AVX512F-NEXT: ## BB#113: ## %cond.load166
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $8, 56(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_114: ## %else167
+; AVX512F-NEXT: kshiftlw $6, %k1, %k2
+; AVX512F-NEXT: kshiftrw $15, %k2, %k2
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_116
+; AVX512F-NEXT: ## BB#115: ## %cond.load169
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $9, 57(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_116: ## %else170
+; AVX512F-NEXT: kshiftlw $5, %k1, %k3
+; AVX512F-NEXT: kshiftrw $15, %k3, %k3
+; AVX512F-NEXT: kmovw %k3, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_118
+; AVX512F-NEXT: ## BB#117: ## %cond.load172
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $10, 58(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_118: ## %else173
+; AVX512F-NEXT: kshiftlw $4, %k1, %k4
+; AVX512F-NEXT: kshiftrw $15, %k4, %k4
+; AVX512F-NEXT: kmovw %k4, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_120
+; AVX512F-NEXT: ## BB#119: ## %cond.load175
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $11, 59(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_120: ## %else176
+; AVX512F-NEXT: kshiftlw $3, %k1, %k5
+; AVX512F-NEXT: kshiftrw $15, %k5, %k5
+; AVX512F-NEXT: kmovw %k5, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_122
+; AVX512F-NEXT: ## BB#121: ## %cond.load178
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $12, 60(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_122: ## %else179
+; AVX512F-NEXT: kshiftlw $2, %k1, %k6
+; AVX512F-NEXT: kshiftrw $15, %k6, %k6
+; AVX512F-NEXT: kmovw %k6, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_124
+; AVX512F-NEXT: ## BB#123: ## %cond.load181
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 61(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_124: ## %else182
+; AVX512F-NEXT: kshiftlw $1, %k1, %k7
+; AVX512F-NEXT: kshiftrw $15, %k7, %k7
+; AVX512F-NEXT: kmovw %k7, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_126
+; AVX512F-NEXT: ## BB#125: ## %cond.load184
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_126: ## %else185
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_128
+; AVX512F-NEXT: ## BB#127: ## %cond.load187
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 63(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_128: ## %else188
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw (%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, (%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k3, %r12d
+; AVX512F-NEXT: kmovw %k4, %r15d
+; AVX512F-NEXT: kmovw %k5, %r14d
+; AVX512F-NEXT: kmovw %k6, %ebx
+; AVX512F-NEXT: kmovw %k7, %r11d
+; AVX512F-NEXT: kmovw %k1, %r10d
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r9d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %edi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %esi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %r13d ## 4-byte Reload
+; AVX512F-NEXT: vmovd %r13d, %xmm2
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm3
+; AVX512F-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, (%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm6
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r13d
+; AVX512F-NEXT: vpinsrb $10, %r12d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r12d
+; AVX512F-NEXT: vpinsrb $11, %r15d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r15d
+; AVX512F-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r14d
+; AVX512F-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %ebx
+; AVX512F-NEXT: vpinsrb $14, %r11d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r11d
+; AVX512F-NEXT: vpinsrb $15, %r10d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r10d
+; AVX512F-NEXT: vmovd %r8d, %xmm7
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm7, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, %r9d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, %edi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $10, %r15d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $12, %ebx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $13, %r11d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $15, %r8d, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: addq $76, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r13
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
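+; With AVX512BW (the SKX run line) the same 64 x i8 masked load is expected
+; to fold into a single natively masked byte load (vmovdqu8), avoiding all
+; of the per-lane branching above.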
; SKX-LABEL: test_mask_load_64xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
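+; Byte and word element types have no masked vector load until AVX512BW, so
+; the AVX and plain AVX512F lowerings below scalarize the operation, testing
+; one mask bit at a time and branching to a per-element load.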
define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_load_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB53_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_6: ## %else5
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_8: ## %else8
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_10: ## %else11
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_12: ## %else14
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_14: ## %else17
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_16: ## %else20
+; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB53_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_16: ## %else20
+; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_load_8xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
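+; As above, 16 x i16 is scalarized per lane on AVX1/AVX2 and on AVX512F; the
+; AVX512F lowering additionally round-trips the i1 mask through a zmm
+; register and a k-register, then zeroes the masked-off lanes of the result
+; with a final vpand.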
define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_load_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB54_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_32: ## %else44
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB54_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_32: ## %else44
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB54_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_32: ## %else44
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_load_16xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
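+; 32 x i16 does not fit in a single ymm register, so the AVX1/AVX2 lowerings
+; process the value as two 256-bit halves, repeating the per-lane
+; extract-and-branch pattern for each half.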
define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_load_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: LBB55_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB55_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_64: ## %else92
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpsraw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $15, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vandps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
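+; The AVX1 sequence above scalarizes the load: each mask byte is moved to
+; eax with vpextrb, bit 0 is tested, and the je skips that element's load
+; when the bit is clear. Loaded words enter a 128-bit half via vpinsrw,
+; and since AVX1 has no 256-bit integer blend, the FP forms
+; (vblendps/vextractf128/vinsertf128) stitch the half back into the ymm
+; accumulator. The tail widens the mask bytes to words with
+; vpmovzxbw/vpunpckhbw, turns bit 0 of each word into all-ones or
+; all-zeros via vpsllw $15 + vpsraw $15, and selects
+; (mask & loaded) | (~mask & passthru) with vandps/vandnps/vorps.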
+; AVX2-LABEL: test_mask_load_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: LBB55_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpextrb $0, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB55_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_64: ## %else92
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
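+; AVX2 follows the same per-element branching, but the cross-lane merges
+; use the integer forms (vpblendd, vextracti128, vinserti128), and once
+; the mask bytes are widened to words the final select against the
+; pass-through needs only one vpblendvb per 256-bit half.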
+; AVX512F-LABEL: test_mask_load_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: LBB55_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpextrb $0, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB55_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_64: ## %else92
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
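+; Without AVX512BW there is no k-register masked load for i16 elements,
+; so the AVX512F run above degrades to the same scalarized sequence as
+; AVX2 rather than using a single masked vector load.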
; SKX-LABEL: test_mask_load_32xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
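; The store tests mirror the loads: lanes whose mask bit is set are
; written with conditional scalar stores, and nothing needs to be merged
; back into a register.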
define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_store_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX-NEXT: LBB56_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX-NEXT: LBB56_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB56_6: ## %else4
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX-NEXT: LBB56_8: ## %else6
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB56_10: ## %else8
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX-NEXT: LBB56_12: ## %else10
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB56_14: ## %else12
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX-NEXT: LBB56_16: ## %else14
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_18
+; AVX-NEXT: ## BB#17: ## %cond.store15
+; AVX-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB56_18: ## %else16
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_20
+; AVX-NEXT: ## BB#19: ## %cond.store17
+; AVX-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX-NEXT: LBB56_20: ## %else18
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_22
+; AVX-NEXT: ## BB#21: ## %cond.store19
+; AVX-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB56_22: ## %else20
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_24
+; AVX-NEXT: ## BB#23: ## %cond.store21
+; AVX-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX-NEXT: LBB56_24: ## %else22
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_26
+; AVX-NEXT: ## BB#25: ## %cond.store23
+; AVX-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB56_26: ## %else24
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_28
+; AVX-NEXT: ## BB#27: ## %cond.store25
+; AVX-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX-NEXT: LBB56_28: ## %else26
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_30
+; AVX-NEXT: ## BB#29: ## %cond.store27
+; AVX-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB56_30: ## %else28
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_32
+; AVX-NEXT: ## BB#31: ## %cond.store29
+; AVX-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX-NEXT: LBB56_32: ## %else30
+; AVX-NEXT: retq
+;
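+; With a plain AVX mask in xmm0, each byte store above is guarded by a
+; vpextrb/testb pair on the corresponding mask byte; vpextrb can also
+; write the data byte straight to memory, so no branch body needs more
+; than one instruction.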
+; AVX512F-LABEL: test_mask_store_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB56_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB56_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB56_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB56_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB56_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB56_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB56_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB56_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB56_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB56_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB56_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB56_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB56_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB56_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB56_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB56_32: ## %else30
+; AVX512F-NEXT: retq
+;
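+; Here AVX512F does materialize a k-register: vpmovsxbd widens the 16
+; mask bytes to dwords, vpslld $31 + vptestmd moves bit 0 of each lane
+; into k0, and bit i is then isolated with kshiftlw $(15 - i) followed by
+; kshiftrw $15 before the kmovw/testb guard. The byte stores themselves
+; are still scalar.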
; SKX-LABEL: test_mask_store_16xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
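; For 32 x i8 the mask and data occupy full ymm registers, so bytes 16-31
; must be reached through the high 128-bit half (vextractf128 on AVX1,
; vextracti128 on AVX2/AVX512F) before each per-byte test or store. The
; elided body presumably calls, following the mangling pattern of the
; declare above,
;   call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val,
;            <32 x i8>* %addr, i32 <align>, <32 x i1> %mask)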
define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_store_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX1-NEXT: LBB57_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX1-NEXT: LBB57_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB57_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX1-NEXT: LBB57_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB57_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX1-NEXT: LBB57_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB57_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX1-NEXT: LBB57_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB57_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX1-NEXT: LBB57_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB57_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX1-NEXT: LBB57_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB57_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX1-NEXT: LBB57_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB57_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX1-NEXT: LBB57_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX1-NEXT: LBB57_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX1-NEXT: LBB57_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB57_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX1-NEXT: LBB57_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB57_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX1-NEXT: LBB57_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB57_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX1-NEXT: LBB57_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB57_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX1-NEXT: LBB57_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB57_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX1-NEXT: LBB57_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB57_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX1-NEXT: LBB57_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX1-NEXT: LBB57_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX1-NEXT: LBB57_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
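+; vzeroupper precedes the return because the routine touched ymm state;
+; clearing the upper halves avoids AVX-to-SSE transition penalties in
+; callers that run legacy SSE code.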
+; AVX2-LABEL: test_mask_store_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX2-NEXT: LBB57_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX2-NEXT: LBB57_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB57_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX2-NEXT: LBB57_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB57_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX2-NEXT: LBB57_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB57_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX2-NEXT: LBB57_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB57_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX2-NEXT: LBB57_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB57_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX2-NEXT: LBB57_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB57_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX2-NEXT: LBB57_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB57_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX2-NEXT: LBB57_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX2-NEXT: LBB57_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX2-NEXT: LBB57_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB57_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX2-NEXT: LBB57_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB57_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX2-NEXT: LBB57_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB57_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX2-NEXT: LBB57_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB57_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX2-NEXT: LBB57_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB57_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX2-NEXT: LBB57_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB57_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX2-NEXT: LBB57_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX2-NEXT: LBB57_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX2-NEXT: LBB57_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
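+; AVX512F without BW cannot mask byte-sized stores through a k-register
+; either, so the sequence that follows matches the AVX2 one.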
+; AVX512F-LABEL: test_mask_store_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB57_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB57_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB57_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB57_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB57_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB57_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB57_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB57_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB57_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB57_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB57_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB57_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB57_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB57_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB57_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB57_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX512F-NEXT: LBB57_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX512F-NEXT: LBB57_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX512F-NEXT: LBB57_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX512F-NEXT: LBB57_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX512F-NEXT: LBB57_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX512F-NEXT: LBB57_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX512F-NEXT: LBB57_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX512F-NEXT: LBB57_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX512F-NEXT: LBB57_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX512F-NEXT: LBB57_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX512F-NEXT: LBB57_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX512F-NEXT: LBB57_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX512F-NEXT: LBB57_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX512F-NEXT: LBB57_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX512F-NEXT: LBB57_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB57_64: ## %else62
+; AVX512F-NEXT: retq
+;
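+; SKX has AVX512BW, so instead of the branchy sequence above it appears to
+; sign-bit-test the <32 x i1> mask in a ymm register (the vpsllw $7 below)
+; and fold the whole store into a single byte-masked instruction.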
; SKX-LABEL: test_mask_store_32xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
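+; Below AVX512BW there is no byte-granularity masked store, so the <64 x i8>
+; case is fully scalarized into a test/branch/vpextrb sequence per element.
+; Note the calling convention: the first six <64 x i1> mask elements arrive in
+; %dil, %sil, %dl, %cl, %r8b and %r9b, and the remaining ones are reloaded
+; from the stack one byte at a time, hence the movb from (%rsp) that opens
+; almost every block below.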
define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_store_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB58_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX1-NEXT: LBB58_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB58_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX1-NEXT: LBB58_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX1-NEXT: LBB58_6: ## %else4
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX1-NEXT: LBB58_8: ## %else6
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB58_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX1-NEXT: LBB58_10: ## %else8
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB58_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX1-NEXT: LBB58_12: ## %else10
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX1-NEXT: LBB58_14: ## %else12
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX1-NEXT: LBB58_16: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX1-NEXT: LBB58_18: ## %else16
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX1-NEXT: LBB58_20: ## %else18
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX1-NEXT: LBB58_22: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX1-NEXT: LBB58_24: ## %else22
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX1-NEXT: LBB58_26: ## %else24
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX1-NEXT: LBB58_28: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX1-NEXT: LBB58_30: ## %else28
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX1-NEXT: LBB58_32: ## %else30
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX1-NEXT: LBB58_34: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX1-NEXT: LBB58_36: ## %else34
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX1-NEXT: LBB58_38: ## %else36
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX1-NEXT: LBB58_40: ## %else38
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX1-NEXT: LBB58_42: ## %else40
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX1-NEXT: LBB58_44: ## %else42
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX1-NEXT: LBB58_46: ## %else44
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX1-NEXT: LBB58_48: ## %else46
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX1-NEXT: LBB58_50: ## %else48
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX1-NEXT: LBB58_52: ## %else50
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX1-NEXT: LBB58_54: ## %else52
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX1-NEXT: LBB58_56: ## %else54
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX1-NEXT: LBB58_58: ## %else56
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX1-NEXT: LBB58_60: ## %else58
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX1-NEXT: LBB58_62: ## %else60
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX1-NEXT: LBB58_64: ## %else62
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_66
+; AVX1-NEXT: ## BB#65: ## %cond.store63
+; AVX1-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX1-NEXT: LBB58_66: ## %else64
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_68
+; AVX1-NEXT: ## BB#67: ## %cond.store65
+; AVX1-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX1-NEXT: LBB58_68: ## %else66
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_70
+; AVX1-NEXT: ## BB#69: ## %cond.store67
+; AVX1-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX1-NEXT: LBB58_70: ## %else68
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_72
+; AVX1-NEXT: ## BB#71: ## %cond.store69
+; AVX1-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX1-NEXT: LBB58_72: ## %else70
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_74
+; AVX1-NEXT: ## BB#73: ## %cond.store71
+; AVX1-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX1-NEXT: LBB58_74: ## %else72
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_76
+; AVX1-NEXT: ## BB#75: ## %cond.store73
+; AVX1-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX1-NEXT: LBB58_76: ## %else74
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_78
+; AVX1-NEXT: ## BB#77: ## %cond.store75
+; AVX1-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX1-NEXT: LBB58_78: ## %else76
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_80
+; AVX1-NEXT: ## BB#79: ## %cond.store77
+; AVX1-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX1-NEXT: LBB58_80: ## %else78
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_82
+; AVX1-NEXT: ## BB#81: ## %cond.store79
+; AVX1-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX1-NEXT: LBB58_82: ## %else80
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_84
+; AVX1-NEXT: ## BB#83: ## %cond.store81
+; AVX1-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX1-NEXT: LBB58_84: ## %else82
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_86
+; AVX1-NEXT: ## BB#85: ## %cond.store83
+; AVX1-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX1-NEXT: LBB58_86: ## %else84
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_88
+; AVX1-NEXT: ## BB#87: ## %cond.store85
+; AVX1-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX1-NEXT: LBB58_88: ## %else86
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_90
+; AVX1-NEXT: ## BB#89: ## %cond.store87
+; AVX1-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX1-NEXT: LBB58_90: ## %else88
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_92
+; AVX1-NEXT: ## BB#91: ## %cond.store89
+; AVX1-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX1-NEXT: LBB58_92: ## %else90
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_94
+; AVX1-NEXT: ## BB#93: ## %cond.store91
+; AVX1-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX1-NEXT: LBB58_94: ## %else92
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_96
+; AVX1-NEXT: ## BB#95: ## %cond.store93
+; AVX1-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX1-NEXT: LBB58_96: ## %else94
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_98
+; AVX1-NEXT: ## BB#97: ## %cond.store95
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX1-NEXT: LBB58_98: ## %else96
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_100
+; AVX1-NEXT: ## BB#99: ## %cond.store97
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX1-NEXT: LBB58_100: ## %else98
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_102
+; AVX1-NEXT: ## BB#101: ## %cond.store99
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX1-NEXT: LBB58_102: ## %else100
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_104
+; AVX1-NEXT: ## BB#103: ## %cond.store101
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX1-NEXT: LBB58_104: ## %else102
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_106
+; AVX1-NEXT: ## BB#105: ## %cond.store103
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX1-NEXT: LBB58_106: ## %else104
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_108
+; AVX1-NEXT: ## BB#107: ## %cond.store105
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX1-NEXT: LBB58_108: ## %else106
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_110
+; AVX1-NEXT: ## BB#109: ## %cond.store107
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX1-NEXT: LBB58_110: ## %else108
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_112
+; AVX1-NEXT: ## BB#111: ## %cond.store109
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX1-NEXT: LBB58_112: ## %else110
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_114
+; AVX1-NEXT: ## BB#113: ## %cond.store111
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX1-NEXT: LBB58_114: ## %else112
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_116
+; AVX1-NEXT: ## BB#115: ## %cond.store113
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX1-NEXT: LBB58_116: ## %else114
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_118
+; AVX1-NEXT: ## BB#117: ## %cond.store115
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX1-NEXT: LBB58_118: ## %else116
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_120
+; AVX1-NEXT: ## BB#119: ## %cond.store117
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX1-NEXT: LBB58_120: ## %else118
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_122
+; AVX1-NEXT: ## BB#121: ## %cond.store119
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX1-NEXT: LBB58_122: ## %else120
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_124
+; AVX1-NEXT: ## BB#123: ## %cond.store121
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX1-NEXT: LBB58_124: ## %else122
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_126
+; AVX1-NEXT: ## BB#125: ## %cond.store123
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX1-NEXT: LBB58_126: ## %else124
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_128
+; AVX1-NEXT: ## BB#127: ## %cond.store125
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX1-NEXT: LBB58_128: ## %else126
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB58_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX2-NEXT: LBB58_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB58_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX2-NEXT: LBB58_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX2-NEXT: LBB58_6: ## %else4
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX2-NEXT: LBB58_8: ## %else6
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB58_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX2-NEXT: LBB58_10: ## %else8
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB58_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX2-NEXT: LBB58_12: ## %else10
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX2-NEXT: LBB58_14: ## %else12
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX2-NEXT: LBB58_16: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX2-NEXT: LBB58_18: ## %else16
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX2-NEXT: LBB58_20: ## %else18
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX2-NEXT: LBB58_22: ## %else20
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX2-NEXT: LBB58_24: ## %else22
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX2-NEXT: LBB58_26: ## %else24
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX2-NEXT: LBB58_28: ## %else26
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX2-NEXT: LBB58_30: ## %else28
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX2-NEXT: LBB58_32: ## %else30
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX2-NEXT: LBB58_34: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX2-NEXT: LBB58_36: ## %else34
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX2-NEXT: LBB58_38: ## %else36
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX2-NEXT: LBB58_40: ## %else38
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX2-NEXT: LBB58_42: ## %else40
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX2-NEXT: LBB58_44: ## %else42
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX2-NEXT: LBB58_46: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX2-NEXT: LBB58_48: ## %else46
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX2-NEXT: LBB58_50: ## %else48
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX2-NEXT: LBB58_52: ## %else50
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX2-NEXT: LBB58_54: ## %else52
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX2-NEXT: LBB58_56: ## %else54
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX2-NEXT: LBB58_58: ## %else56
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX2-NEXT: LBB58_60: ## %else58
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX2-NEXT: LBB58_62: ## %else60
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX2-NEXT: LBB58_64: ## %else62
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_66
+; AVX2-NEXT: ## BB#65: ## %cond.store63
+; AVX2-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX2-NEXT: LBB58_66: ## %else64
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_68
+; AVX2-NEXT: ## BB#67: ## %cond.store65
+; AVX2-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX2-NEXT: LBB58_68: ## %else66
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_70
+; AVX2-NEXT: ## BB#69: ## %cond.store67
+; AVX2-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX2-NEXT: LBB58_70: ## %else68
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_72
+; AVX2-NEXT: ## BB#71: ## %cond.store69
+; AVX2-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX2-NEXT: LBB58_72: ## %else70
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_74
+; AVX2-NEXT: ## BB#73: ## %cond.store71
+; AVX2-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX2-NEXT: LBB58_74: ## %else72
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_76
+; AVX2-NEXT: ## BB#75: ## %cond.store73
+; AVX2-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX2-NEXT: LBB58_76: ## %else74
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_78
+; AVX2-NEXT: ## BB#77: ## %cond.store75
+; AVX2-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX2-NEXT: LBB58_78: ## %else76
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_80
+; AVX2-NEXT: ## BB#79: ## %cond.store77
+; AVX2-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX2-NEXT: LBB58_80: ## %else78
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_82
+; AVX2-NEXT: ## BB#81: ## %cond.store79
+; AVX2-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX2-NEXT: LBB58_82: ## %else80
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_84
+; AVX2-NEXT: ## BB#83: ## %cond.store81
+; AVX2-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX2-NEXT: LBB58_84: ## %else82
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_86
+; AVX2-NEXT: ## BB#85: ## %cond.store83
+; AVX2-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX2-NEXT: LBB58_86: ## %else84
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_88
+; AVX2-NEXT: ## BB#87: ## %cond.store85
+; AVX2-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX2-NEXT: LBB58_88: ## %else86
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_90
+; AVX2-NEXT: ## BB#89: ## %cond.store87
+; AVX2-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX2-NEXT: LBB58_90: ## %else88
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_92
+; AVX2-NEXT: ## BB#91: ## %cond.store89
+; AVX2-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX2-NEXT: LBB58_92: ## %else90
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_94
+; AVX2-NEXT: ## BB#93: ## %cond.store91
+; AVX2-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX2-NEXT: LBB58_94: ## %else92
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_96
+; AVX2-NEXT: ## BB#95: ## %cond.store93
+; AVX2-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX2-NEXT: LBB58_96: ## %else94
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_98
+; AVX2-NEXT: ## BB#97: ## %cond.store95
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX2-NEXT: LBB58_98: ## %else96
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_100
+; AVX2-NEXT: ## BB#99: ## %cond.store97
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX2-NEXT: LBB58_100: ## %else98
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_102
+; AVX2-NEXT: ## BB#101: ## %cond.store99
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX2-NEXT: LBB58_102: ## %else100
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_104
+; AVX2-NEXT: ## BB#103: ## %cond.store101
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX2-NEXT: LBB58_104: ## %else102
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_106
+; AVX2-NEXT: ## BB#105: ## %cond.store103
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX2-NEXT: LBB58_106: ## %else104
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_108
+; AVX2-NEXT: ## BB#107: ## %cond.store105
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX2-NEXT: LBB58_108: ## %else106
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_110
+; AVX2-NEXT: ## BB#109: ## %cond.store107
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX2-NEXT: LBB58_110: ## %else108
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_112
+; AVX2-NEXT: ## BB#111: ## %cond.store109
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX2-NEXT: LBB58_112: ## %else110
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_114
+; AVX2-NEXT: ## BB#113: ## %cond.store111
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX2-NEXT: LBB58_114: ## %else112
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_116
+; AVX2-NEXT: ## BB#115: ## %cond.store113
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX2-NEXT: LBB58_116: ## %else114
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_118
+; AVX2-NEXT: ## BB#117: ## %cond.store115
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX2-NEXT: LBB58_118: ## %else116
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_120
+; AVX2-NEXT: ## BB#119: ## %cond.store117
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX2-NEXT: LBB58_120: ## %else118
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_122
+; AVX2-NEXT: ## BB#121: ## %cond.store119
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX2-NEXT: LBB58_122: ## %else120
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_124
+; AVX2-NEXT: ## BB#123: ## %cond.store121
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX2-NEXT: LBB58_124: ## %else122
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_126
+; AVX2-NEXT: ## BB#125: ## %cond.store123
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX2-NEXT: LBB58_126: ## %else124
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_128
+; AVX2-NEXT: ## BB#127: ## %cond.store125
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX2-NEXT: LBB58_128: ## %else126
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm4, (%rdi)
+; AVX512F-NEXT: LBB58_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm4, 1(%rdi)
+; AVX512F-NEXT: LBB58_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm4, 2(%rdi)
+; AVX512F-NEXT: LBB58_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm4, 3(%rdi)
+; AVX512F-NEXT: LBB58_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm4, 4(%rdi)
+; AVX512F-NEXT: LBB58_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm4, 5(%rdi)
+; AVX512F-NEXT: LBB58_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm4, 6(%rdi)
+; AVX512F-NEXT: LBB58_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm4, 7(%rdi)
+; AVX512F-NEXT: LBB58_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm4, 8(%rdi)
+; AVX512F-NEXT: LBB58_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm4, 9(%rdi)
+; AVX512F-NEXT: LBB58_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm4, 10(%rdi)
+; AVX512F-NEXT: LBB58_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm4, 11(%rdi)
+; AVX512F-NEXT: LBB58_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm4, 12(%rdi)
+; AVX512F-NEXT: LBB58_26: ## %else24
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm4, 13(%rdi)
+; AVX512F-NEXT: LBB58_28: ## %else26
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm4, 14(%rdi)
+; AVX512F-NEXT: LBB58_30: ## %else28
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm4, 15(%rdi)
+; AVX512F-NEXT: LBB58_32: ## %else30
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi)
+; AVX512F-NEXT: LBB58_34: ## %else32
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi)
+; AVX512F-NEXT: LBB58_36: ## %else34
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB58_38: ## %else36
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi)
+; AVX512F-NEXT: LBB58_40: ## %else38
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB58_42: ## %else40
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi)
+; AVX512F-NEXT: LBB58_44: ## %else42
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB58_46: ## %else44
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi)
+; AVX512F-NEXT: LBB58_48: ## %else46
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB58_50: ## %else48
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi)
+; AVX512F-NEXT: LBB58_52: ## %else50
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB58_54: ## %else52
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi)
+; AVX512F-NEXT: LBB58_56: ## %else54
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB58_58: ## %else56
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $13, %xmm1, 29(%rdi)
+; AVX512F-NEXT: LBB58_60: ## %else58
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $14, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB58_62: ## %else60
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB58_64: ## %else62
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_66
+; AVX512F-NEXT: ## BB#65: ## %cond.store63
+; AVX512F-NEXT: vpextrb $0, %xmm5, 32(%rdi)
+; AVX512F-NEXT: LBB58_66: ## %else64
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_68
+; AVX512F-NEXT: ## BB#67: ## %cond.store65
+; AVX512F-NEXT: vpextrb $1, %xmm5, 33(%rdi)
+; AVX512F-NEXT: LBB58_68: ## %else66
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_70
+; AVX512F-NEXT: ## BB#69: ## %cond.store67
+; AVX512F-NEXT: vpextrb $2, %xmm5, 34(%rdi)
+; AVX512F-NEXT: LBB58_70: ## %else68
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_72
+; AVX512F-NEXT: ## BB#71: ## %cond.store69
+; AVX512F-NEXT: vpextrb $3, %xmm5, 35(%rdi)
+; AVX512F-NEXT: LBB58_72: ## %else70
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_74
+; AVX512F-NEXT: ## BB#73: ## %cond.store71
+; AVX512F-NEXT: vpextrb $4, %xmm5, 36(%rdi)
+; AVX512F-NEXT: LBB58_74: ## %else72
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_76
+; AVX512F-NEXT: ## BB#75: ## %cond.store73
+; AVX512F-NEXT: vpextrb $5, %xmm5, 37(%rdi)
+; AVX512F-NEXT: LBB58_76: ## %else74
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_78
+; AVX512F-NEXT: ## BB#77: ## %cond.store75
+; AVX512F-NEXT: vpextrb $6, %xmm5, 38(%rdi)
+; AVX512F-NEXT: LBB58_78: ## %else76
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_80
+; AVX512F-NEXT: ## BB#79: ## %cond.store77
+; AVX512F-NEXT: vpextrb $7, %xmm5, 39(%rdi)
+; AVX512F-NEXT: LBB58_80: ## %else78
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_82
+; AVX512F-NEXT: ## BB#81: ## %cond.store79
+; AVX512F-NEXT: vpextrb $8, %xmm5, 40(%rdi)
+; AVX512F-NEXT: LBB58_82: ## %else80
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_84
+; AVX512F-NEXT: ## BB#83: ## %cond.store81
+; AVX512F-NEXT: vpextrb $9, %xmm5, 41(%rdi)
+; AVX512F-NEXT: LBB58_84: ## %else82
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_86
+; AVX512F-NEXT: ## BB#85: ## %cond.store83
+; AVX512F-NEXT: vpextrb $10, %xmm5, 42(%rdi)
+; AVX512F-NEXT: LBB58_86: ## %else84
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_88
+; AVX512F-NEXT: ## BB#87: ## %cond.store85
+; AVX512F-NEXT: vpextrb $11, %xmm5, 43(%rdi)
+; AVX512F-NEXT: LBB58_88: ## %else86
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_90
+; AVX512F-NEXT: ## BB#89: ## %cond.store87
+; AVX512F-NEXT: vpextrb $12, %xmm5, 44(%rdi)
+; AVX512F-NEXT: LBB58_90: ## %else88
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_92
+; AVX512F-NEXT: ## BB#91: ## %cond.store89
+; AVX512F-NEXT: vpextrb $13, %xmm5, 45(%rdi)
+; AVX512F-NEXT: LBB58_92: ## %else90
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_94
+; AVX512F-NEXT: ## BB#93: ## %cond.store91
+; AVX512F-NEXT: vpextrb $14, %xmm5, 46(%rdi)
+; AVX512F-NEXT: LBB58_94: ## %else92
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_96
+; AVX512F-NEXT: ## BB#95: ## %cond.store93
+; AVX512F-NEXT: vpextrb $15, %xmm5, 47(%rdi)
+; AVX512F-NEXT: LBB58_96: ## %else94
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_98
+; AVX512F-NEXT: ## BB#97: ## %cond.store95
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 48(%rdi)
+; AVX512F-NEXT: LBB58_98: ## %else96
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_100
+; AVX512F-NEXT: ## BB#99: ## %cond.store97
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 49(%rdi)
+; AVX512F-NEXT: LBB58_100: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_102
+; AVX512F-NEXT: ## BB#101: ## %cond.store99
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 50(%rdi)
+; AVX512F-NEXT: LBB58_102: ## %else100
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_104
+; AVX512F-NEXT: ## BB#103: ## %cond.store101
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 51(%rdi)
+; AVX512F-NEXT: LBB58_104: ## %else102
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_106
+; AVX512F-NEXT: ## BB#105: ## %cond.store103
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 52(%rdi)
+; AVX512F-NEXT: LBB58_106: ## %else104
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_108
+; AVX512F-NEXT: ## BB#107: ## %cond.store105
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 53(%rdi)
+; AVX512F-NEXT: LBB58_108: ## %else106
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_110
+; AVX512F-NEXT: ## BB#109: ## %cond.store107
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 54(%rdi)
+; AVX512F-NEXT: LBB58_110: ## %else108
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_112
+; AVX512F-NEXT: ## BB#111: ## %cond.store109
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 55(%rdi)
+; AVX512F-NEXT: LBB58_112: ## %else110
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_114
+; AVX512F-NEXT: ## BB#113: ## %cond.store111
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 56(%rdi)
+; AVX512F-NEXT: LBB58_114: ## %else112
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_116
+; AVX512F-NEXT: ## BB#115: ## %cond.store113
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 57(%rdi)
+; AVX512F-NEXT: LBB58_116: ## %else114
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_118
+; AVX512F-NEXT: ## BB#117: ## %cond.store115
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 58(%rdi)
+; AVX512F-NEXT: LBB58_118: ## %else116
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_120
+; AVX512F-NEXT: ## BB#119: ## %cond.store117
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 59(%rdi)
+; AVX512F-NEXT: LBB58_120: ## %else118
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_122
+; AVX512F-NEXT: ## BB#121: ## %cond.store119
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 60(%rdi)
+; AVX512F-NEXT: LBB58_122: ## %else120
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_124
+; AVX512F-NEXT: ## BB#123: ## %cond.store121
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm0, 61(%rdi)
+; AVX512F-NEXT: LBB58_124: ## %else122
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_126
+; AVX512F-NEXT: ## BB#125: ## %cond.store123
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB58_126: ## %else124
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_128
+; AVX512F-NEXT: ## BB#127: ## %cond.store125
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 63(%rdi)
+; AVX512F-NEXT: LBB58_128: ## %else126
+; AVX512F-NEXT: retq
+;
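+; With AVX512BW, SKX converts the entire <64 x i1> mask to a single k-register
+; (the vpsllw $7 on %zmm0 below starts that conversion), avoiding the
+; per-element branching used by the other targets.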
; SKX-LABEL: test_mask_store_64xi8:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_store_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: movw %ax, (%rdi)
+; AVX-NEXT: LBB59_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB59_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB59_6: ## %else4
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB59_8: ## %else6
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB59_10: ## %else8
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB59_12: ## %else10
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB59_14: ## %else12
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB59_16: ## %else14
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB59_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB59_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB59_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB59_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB59_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB59_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB59_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB59_16: ## %else14
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_store_8xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
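+; For v16i16 the stored value spans a full ymm register: lanes 8-15 sit in the
+; upper 128 bits of ymm1, so each guarded store of those lanes re-extracts the
+; high half (vextractf128 on AVX1, vextracti128 on AVX2/AVX512F) before the
+; vpextrw. AVX512F can still drive the mask through a k-register here, since a
+; <16 x i1> mask widens to v16i32 (vpmovsxbd + vpslld + vptestmd).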
define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_store_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB60_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB60_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB60_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB60_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB60_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB60_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB60_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB60_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB60_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB60_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB60_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB60_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB60_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB60_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB60_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX1-NEXT: LBB60_32: ## %else30
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB60_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB60_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB60_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB60_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB60_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB60_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB60_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB60_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB60_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB60_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB60_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB60_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB60_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB60_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB60_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX2-NEXT: LBB60_32: ## %else30
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB60_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB60_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB60_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB60_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB60_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB60_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB60_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB60_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB60_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB60_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB60_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB60_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB60_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB60_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB60_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX512F-NEXT: LBB60_32: ## %else30
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_store_16xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
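+; A <32 x i1> mask apparently no longer fits AVX512F's 16-bit k-register
+; handling, so in the checks below even the AVX512F lowering falls back to the
+; plain AVX pattern: vpextrb/testb per lane, with the value split across ymm1
+; (lanes 0-15) and ymm2 (lanes 16-31) and the mask's upper lanes reached via
+; vextracti128 of ymm0.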
define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_store_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB61_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB61_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB61_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB61_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB61_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB61_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB61_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB61_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB61_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX1-NEXT: LBB61_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX1-NEXT: LBB61_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX1-NEXT: LBB61_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX1-NEXT: LBB61_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX1-NEXT: LBB61_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX1-NEXT: LBB61_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX1-NEXT: LBB61_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 32(%rdi)
+; AVX1-NEXT: LBB61_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX1-NEXT: LBB61_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX1-NEXT: LBB61_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX1-NEXT: LBB61_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX1-NEXT: LBB61_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX1-NEXT: LBB61_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX1-NEXT: LBB61_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX1-NEXT: LBB61_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, 48(%rdi)
+; AVX1-NEXT: LBB61_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX1-NEXT: LBB61_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX1-NEXT: LBB61_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX1-NEXT: LBB61_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX1-NEXT: LBB61_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX1-NEXT: LBB61_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX1-NEXT: LBB61_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX1-NEXT: LBB61_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB61_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB61_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB61_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB61_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB61_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB61_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB61_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB61_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB61_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX2-NEXT: LBB61_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX2-NEXT: LBB61_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX2-NEXT: LBB61_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX2-NEXT: LBB61_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX2-NEXT: LBB61_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX2-NEXT: LBB61_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX2-NEXT: LBB61_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 32(%rdi)
+; AVX2-NEXT: LBB61_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX2-NEXT: LBB61_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX2-NEXT: LBB61_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX2-NEXT: LBB61_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX2-NEXT: LBB61_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX2-NEXT: LBB61_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX2-NEXT: LBB61_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX2-NEXT: LBB61_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, 48(%rdi)
+; AVX2-NEXT: LBB61_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX2-NEXT: LBB61_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX2-NEXT: LBB61_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX2-NEXT: LBB61_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX2-NEXT: LBB61_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX2-NEXT: LBB61_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX2-NEXT: LBB61_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX2-NEXT: LBB61_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB61_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB61_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB61_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB61_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB61_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB61_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB61_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB61_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB61_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX512F-NEXT: LBB61_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX512F-NEXT: LBB61_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX512F-NEXT: LBB61_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX512F-NEXT: LBB61_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX512F-NEXT: LBB61_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX512F-NEXT: LBB61_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB61_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: movw %ax, 32(%rdi)
+; AVX512F-NEXT: LBB61_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX512F-NEXT: LBB61_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX512F-NEXT: LBB61_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX512F-NEXT: LBB61_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX512F-NEXT: LBB61_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX512F-NEXT: LBB61_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX512F-NEXT: LBB61_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX512F-NEXT: LBB61_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, 48(%rdi)
+; AVX512F-NEXT: LBB61_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX512F-NEXT: LBB61_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX512F-NEXT: LBB61_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX512F-NEXT: LBB61_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX512F-NEXT: LBB61_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX512F-NEXT: LBB61_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX512F-NEXT: LBB61_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB61_64: ## %else62
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_mask_store_32xi16:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0