From 6b6c62c75b8b5047a0def2dedcadd367a88e7002 Mon Sep 17 00:00:00 2001
From: Noah Goldstein
Date: Thu, 16 Feb 2023 11:57:31 -0600
Subject: [PATCH] Widen i16 shuffle masks if vector width < 512 even with BWI

`{v}blend{d|ps|pd}` is preferable to `{v}blendw`, so widen the shuffle mask
so that we can match it.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D143789
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  7 ++++---
 llvm/test/CodeGen/X86/combine-sdiv.ll   | 26 ++++++--------------------
 llvm/test/CodeGen/X86/shuffle-blendw.ll |  8 ++++----
 3 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6188d1d..7ff686c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19839,9 +19839,10 @@ static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
   if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
     return false;
 
-  // i8 is better to be widen to i16, because there is PBLENDW for vXi16
-  // when the vector bit size is 128 or 256.
-  if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
+  // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
+  // are preferable to blendw/blendvb/masked-mov.
+  if ((VT == MVT::i16 || VT == MVT::i8) &&
+      V1.getSimpleValueType().getSizeInBits() < 512)
     return false;
 
   auto HasMaskOperation = [&](SDValue V) {
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 3342ecc..d9a6b6d 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,26 +2889,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX512BW-NEXT:    retq
+; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2ORLATER:       # %bb.0:
+; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2ORLATER-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2ORLATER-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index b5ce5ee..9f90657 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -66,13 +66,13 @@ define <16 x i16> @blendw_to_blendd_32(<16 x i16> %x, <16 x i16> %y, <16 x i16>
 ; X86-AVX512-LABEL: blendw_to_blendd_32:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X86-AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X86-AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_32:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7],ymm0[8,9],ymm1[10,11],ymm0[12,13],ymm1[14,15]
+; X64-AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; X64-AVX512-NEXT:    retq
   %x1 = add <16 x i16> %x, %z
   %shuffle = shufflevector <16 x i16> %x1, <16 x i16> %y, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
@@ -119,13 +119,13 @@ define <8 x i16> @blendw_to_blendd_16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z)
 ; X86-AVX512-LABEL: blendw_to_blendd_16:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X86-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX512-LABEL: blendw_to_blendd_16:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-AVX512-NEXT:    retq
   %x1 = add <8 x i16> %x, %z
   %shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-- 
2.7.4
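
Note (not part of the patch): below is a minimal standalone reproducer sketch
for the new lowering, modeled on the blendw_to_blendd_16 test above. The
function name @repro is made up, and the triple/feature flags are an
assumption since the test file's RUN lines are not shown here; any x86 target
with AVX512BW and AVX512VL should exercise the same path.

; RUN: llc -mtriple=x86_64 -mattr=+avx512f,+avx512bw,+avx512vl < %s
define <8 x i16> @repro(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z) {
  ; The leading add mirrors the blendw_to_blendd tests, which use it to
  ; exercise the canCombineAsMaskOperation path changed by this patch.
  %x1 = add <8 x i16> %x, %z
  ; Blend: low four i16 lanes from %x1, high four from %y. Per the CHECK
  ; updates above, an AVX512BW target previously emitted vpblendw here;
  ; with this patch the vXi16 mask is widened and vpblendd is matched.
  %s = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s
}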