[X86][AVX] Use X86ISD::VPERM2X128 for blend-with-zero if optimizing for size

author Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 12 May 2020 11:31:07 +0000 (12:31 +0100)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 12 May 2020 12:03:50 +0000 (13:03 +0100)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 12 May 2020 11:31:07 +0000 (12:31 +0100)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 12 May 2020 12:03:50 +0000 (13:03 +0100)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 09a4f70..42723a0 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34062,6 +34062,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      return DAG.getBitcast(RootVT, V1);
    }
  
+  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
    unsigned RootSizeInBits = RootVT.getSizeInBits();
    unsigned NumRootElts = RootVT.getVectorNumElements();
    unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -34205,9 +34206,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  
      // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
      // we need to use the zeroing feature.
+    // Prefer blends for sequential shuffles unless we are optimizing for size.
      if (UnaryShuffle &&
          !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
-        !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
        unsigned PermMask = 0;
        PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
        PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll

index 6344506..27edb31 100644 (file)
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -357,8 +357,8 @@ entry:
  
  ;; Test zero mask generation.
  ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
-;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
-;; TODO: When building for optsize we should use vperm2f128.
+;; Prefer xor+vblendpd over vperm2f128 because that has better performance,
+;; unless building for optsize where we should still use vperm2f128.
  
  define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
  ; ALL-LABEL: shuffle_v4f64_zz01:
@@ -389,8 +389,7 @@ define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
  define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
  ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
  ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
    ret <4 x double> %s
@@ -425,8 +424,7 @@ define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
  define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
  ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
  ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
  ; ALL-NEXT:    retq
    %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
    ret <4 x double> %s
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 12 May 2020 11:31:07 +0000 (12:31 +0100)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 12 May 2020 12:03:50 +0000 (13:03 +0100)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/X86/avx-vperm2x128.ll		patch | blob | history