[X86][SSE] Add support for target shuffle combining to INSERTPS

author Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 9 Jul 2016 21:47:55 +0000 (21:47 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 9 Jul 2016 21:47:55 +0000 (21:47 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 4e3a85a..83e910d 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8677,16 +8677,14 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
  // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
  // perform INSERTPS if a single V1 element is out of place and all V2
  // elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
-                                            SDValue V1, SDValue V2,
-                                            ArrayRef<int> Mask,
-                                            SelectionDAG &DAG) {
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+                                         unsigned &InsertPSMask,
+                                         const SmallBitVector &Zeroable,
+                                         ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
    assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
    assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
-  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
    unsigned ZMask = 0;
    int V1DstIndex = -1;
    int V2DstIndex = -1;
@@ -8707,7 +8705,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
  
      // We can only insert a single non-zeroable element.
      if (V1DstIndex >= 0 || V2DstIndex >= 0)
-      return SDValue();
+      return false;
  
      if (Mask[i] < 4) {
        // V1 input out of place for insertion.
@@ -8720,7 +8718,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
  
    // Don't bother if we have no (non-zeroable) element for insertion.
    if (V1DstIndex < 0 && V2DstIndex < 0)
-    return SDValue();
+    return false;
  
    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
@@ -8740,8 +8738,21 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
    if (!V1UsedInPlace)
      V1 = DAG.getUNDEF(MVT::v4f32);
  
-  unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+  // Insert the V2 element into the desired position.
+  InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+  return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  // Attempt to match the insertps pattern.
+  unsigned InsertPSMask;
+  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+    return SDValue();
  
    // Insert the V2 element into the desired position.
    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
@@ -25081,6 +25092,33 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
      }
    }
  
+  // Attempt to combine to INSERTPS.
+  if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+      (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+    SmallBitVector Zeroable(4, false);
+    for (unsigned i = 0; i != NumMaskElts; ++i)
+      if (Mask[i] < 0)
+        Zeroable[i] = true;
+
+    unsigned InsertPSMask;
+    SDValue V1 = Input, V2 = Input;
+    if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+                                                       Zeroable, Mask, DAG)) {
+      if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
+        return false; // Nothing to do!
+      V1 = DAG.getBitcast(MVT::v4f32, V1);
+      DCI.AddToWorklist(V1.getNode());
+      V2 = DAG.getBitcast(MVT::v4f32, V2);
+      DCI.AddToWorklist(V2.getNode());
+      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                        DAG.getConstant(InsertPSMask, DL, MVT::i8));
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+  }
+
    // Don't try to re-form single instruction chains under any circumstances now
    // that we've done encoding canonicalization for them.
    if (Depth < 2)
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll

index 01d859e..b21fdec 100644 (file)
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -60,17 +60,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
  define <4 x float> @shuffle_v4f32_0zz0(float %a) {
  ; SSE-LABEL: shuffle_v4f32_0zz0:
  ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: shuffle_v4f32_0zz0:
  ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
  ; AVX-NEXT:    retq
    %vecinit = insertelement <4 x float> undef, float %a, i32 0
    %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

index ffae771..ea6535d 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -190,7 +190,7 @@ define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
  define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
  ; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
  ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
  ; ALL-NEXT:    retq
    %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
    %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 9 Jul 2016 21:47:55 +0000 (21:47 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 9 Jul 2016 21:47:55 +0000 (21:47 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/X86/insertps-combine.ll		patch | blob | history
llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll		patch | blob | history