; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg:
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_reg:
; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm4, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg:
; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_mem: