[X86][AVX512] Only combine EVEX target shuffles to shuffles of the same number of vector elements

author Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 896ce13..2591aef 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25287,6 +25287,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    }
  
    unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned NumRootElts = RootVT.getVectorNumElements();
    unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
    bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
                       (RootVT.is256BitVector() && !Subtarget.hasAVX2());
@@ -25297,11 +25298,10 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    // TODO - this currently prevents all lane shuffles from occurring.
    // TODO - check for writemasks usage instead of always preventing combining.
    // TODO - attempt to narrow Mask back to writemask size.
-  if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
-      (RootSizeInBits == 512 ||
-       (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
+  bool IsEVEXShuffle =
+      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
      return false;
-  }
  
    // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
  
@@ -25370,6 +25370,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) {
        if (Depth == 1 && Root.getOpcode() == Shuffle)
          return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
        Res = DAG.getBitcast(ShuffleVT, V1);
        DCI.AddToWorklist(Res.getNode());
        Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@@ -25383,6 +25385,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                         ShuffleVT, PermuteImm)) {
        if (Depth == 1 && Root.getOpcode() == Shuffle)
          return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
        Res = DAG.getBitcast(ShuffleVT, V1);
        DCI.AddToWorklist(Res.getNode());
        Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
@@ -25398,6 +25402,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                 ShuffleVT)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
      V1 = DAG.getBitcast(ShuffleVT, V1);
      DCI.AddToWorklist(V1.getNode());
      V2 = DAG.getBitcast(ShuffleVT, V2);
@@ -25413,6 +25419,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                        Shuffle, ShuffleVT, PermuteImm)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
      V1 = DAG.getBitcast(ShuffleVT, V1);
      DCI.AddToWorklist(V1.getNode());
      V2 = DAG.getBitcast(ShuffleVT, V2);
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll

index 6fb698b..26db8fc 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3241,7 +3241,8 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
  ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
  ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
  ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  ; AVX512VL-NEXT:    addq $24, %rsp
  ; AVX512VL-NEXT:    popq %rbx
  ; AVX512VL-NEXT:    popq %r14
@@ -4148,7 +4149,8 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
  ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
  ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
  ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  ; AVX512VL-NEXT:    vmovdqa32 %xmm0, (%r14)
  ; AVX512VL-NEXT:    addq $16, %rsp
  ; AVX512VL-NEXT:    popq %rbx
@@ -5136,7 +5138,8 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
  ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
  ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
  ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  ; AVX512VL-NEXT:    addq $40, %rsp
  ; AVX512VL-NEXT:    popq %rbx
  ; AVX512VL-NEXT:    popq %r14
@@ -5939,7 +5942,8 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
  ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
  ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
  ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  ; AVX512VL-NEXT:    vmovdqa32 %xmm0, (%r14)
  ; AVX512VL-NEXT:    addq $32, %rsp
  ; AVX512VL-NEXT:    popq %rbx
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sun, 28 Aug 2016 17:27:14 +0000 (17:27 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/X86/vector-half-conversions.ll		patch | blob | history