// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
unsigned ZMask = 0;
int V1DstIndex = -1;
int V2DstIndex = -1;
// We can only insert a single non-zeroable element.
if (V1DstIndex >= 0 || V2DstIndex >= 0)
- return SDValue();
+ return false;
if (Mask[i] < 4) {
// V1 input out of place for insertion.
// Don't bother if we have no (non-zeroable) element for insertion.
if (V1DstIndex < 0 && V2DstIndex < 0)
- return SDValue();
+ return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
if (!V1UsedInPlace)
V1 = DAG.getUNDEF(MVT::v4f32);
- unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ // Build the INSERTPS immediate: V2 source index in bits [7:6], destination
+ // index in bits [5:4], and the zero mask in the low bits.
+ InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ // Attempt to match the insertps pattern.
+ unsigned InsertPSMask;
+ if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
}
}
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+ (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ unsigned InsertPSMask;
+ SDValue V1 = Input, V2 = Input;
+ if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+ Zeroable, Mask, DAG)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT: retq
%vecinit = insertelement <4 x float> undef, float %a, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
; ALL: # BB#0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; ALL-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>