assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- unsigned ZMask = 0;
- int V1DstIndex = -1;
- int V2DstIndex = -1;
- bool V1UsedInPlace = false;
- for (int i = 0; i < 4; ++i) {
- // Synthesize a zero mask from the zeroable elements (includes undefs).
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- continue;
- }
+ // Attempt to match INSERTPS with one element from VA or VB being
+ // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+ // are updated.
+ auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+ ArrayRef<int> CandidateMask) {
+ unsigned ZMask = 0;
+ int VADstIndex = -1;
+ int VBDstIndex = -1;
+ bool VAUsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
- // Flag if we use any V1 inputs in place.
- if (i == Mask[i]) {
- V1UsedInPlace = true;
- continue;
+ // Flag if we use any VA inputs in place.
+ if (i == CandidateMask[i]) {
+ VAUsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (VADstIndex >= 0 || VBDstIndex >= 0)
+ return false;
+
+ if (CandidateMask[i] < 4) {
+ // VA input out of place for insertion.
+ VADstIndex = i;
+ } else {
+ // VB input for insertion.
+ VBDstIndex = i;
+ }
}
- // We can only insert a single non-zeroable element.
- if (V1DstIndex >= 0 || V2DstIndex >= 0)
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (VADstIndex < 0 && VBDstIndex < 0)
return false;
- if (Mask[i] < 4) {
- // V1 input out of place for insertion.
- V1DstIndex = i;
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned VBSrcIndex = 0;
+ if (VADstIndex >= 0) {
+ // If we have a VA input out of place, we use VA as the V2 element
+ // insertion and don't use the original V2 at all.
+ VBSrcIndex = CandidateMask[VADstIndex];
+ VBDstIndex = VADstIndex;
+ VB = VA;
} else {
- // V2 input for insertion.
- V2DstIndex = i;
+ VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
- }
- // Don't bother if we have no (non-zeroable) element for insertion.
- if (V1DstIndex < 0 && V2DstIndex < 0)
- return false;
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!VAUsedInPlace)
+ VA = DAG.getUNDEF(MVT::v4f32);
- // Determine element insertion src/dst indices. The src index is from the
- // start of the inserted vector, not the start of the concatenated vector.
- unsigned V2SrcIndex = 0;
- if (V1DstIndex >= 0) {
- // If we have a V1 input out of place, we use V1 as the V2 element insertion
- // and don't use the original V2 at all.
- V2SrcIndex = Mask[V1DstIndex];
- V2DstIndex = V1DstIndex;
- V2 = V1;
- } else {
- V2SrcIndex = Mask[V2DstIndex] - 4;
- }
+ // Update V1, V2 and InsertPSMask accordingly.
+ V1 = VA;
+ V2 = VB;
- // If no V1 inputs are used in place, then the result is created only from
- // the zero mask and the V2 insertion - so remove V1 dependency.
- if (!V1UsedInPlace)
- V1 = DAG.getUNDEF(MVT::v4f32);
+ // Insert the V2 element into the desired position.
+ InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+ };
- // Insert the V2 element into the desired position.
- InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
- return true;
+ if (matchAsInsertPS(V1, V2, Mask))
+ return true;
+
+ // Commute and try again.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ if (matchAsInsertPS(V2, V1, CommutedMask))
+ return true;
+
+ return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_z06z:
; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3]
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_z06z:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 undef>
%shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00>, <4 x i32> <i32 4, i32 1, i32 2, i32 7>