From 45aa1b88534c74246773aab6dd33c3568bb25d24 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 12 May 2020 12:31:07 +0100
Subject: [PATCH] [X86][AVX] Use X86ISD::VPERM2X128 for blend-with-zero if
 optimizing for size

Last part of PR22984 - avoid the zero-register dependency if optimizing for size
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  4 +++-
 llvm/test/CodeGen/X86/avx-vperm2x128.ll | 10 ++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 09a4f70..42723a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34062,6 +34062,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return DAG.getBitcast(RootVT, V1);
   }
 
+  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
   unsigned RootSizeInBits = RootVT.getSizeInBits();
   unsigned NumRootElts = RootVT.getVectorNumElements();
   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -34205,9 +34206,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
   // we need to use the zeroing feature.
+  // Prefer blends for sequential shuffles unless we are optimizing for size.
   if (UnaryShuffle &&
       !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
-      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+      (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
     unsigned PermMask = 0;
     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
index 6344506..27edb31 100644
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -357,8 +357,8 @@ entry:
 
 ;; Test zero mask generation.
 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
-;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
-;; TODO: When building for optsize we should use vperm2f128.
+;; Prefer xor+vblendpd over vperm2f128 because that has better performance,
+;; unless building for optsize where we should still use vperm2f128.
 
 define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
 ; ALL-LABEL: shuffle_v4f64_zz01:
@@ -389,8 +389,7 @@ define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
 ; ALL-NEXT:    retq
   %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> 
   ret <4 x double> %s
@@ -425,8 +424,7 @@ define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
 ; ALL-NEXT:    retq
   %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> 
   ret <4 x double> %s
-- 
2.7.4
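
Note (not part of the patch): the IR below is a minimal standalone sketch of the blend-with-zero pattern this change targets, built under the same llc flags as the test file's RUN lines. The function name, the zeroinitializer operand and the exact shuffle mask are illustrative choices rather than copies of the test; with the optsize attribute and this patch applied, the shuffle should lower to a single vperm2f128 instead of vxorps + vblendps.

; Hypothetical reproducer -- run with, e.g.:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx repro.ll -o -
define <4 x double> @blend_with_zero_optsize(<4 x double> %a) optsize {
  ; Result elements 0-1 come from the zero vector, elements 2-3 from %a
  ; (the "zz23" blend-with-zero pattern exercised by the test changes above).
  %s = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}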