return false;
}
+// Returns true if \p Mask can be scaled (widened or narrowed) to a shuffle
+// mask with \p NumDstElts elements, per scaleShuffleElements; the scaled mask
+// itself is computed but discarded — only feasibility is reported.
+static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
+  SmallVector<int, 32> WidenedMask;
+  return scaleShuffleElements(Mask, NumDstElts, WidenedMask);
+}
+
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
RMask.push_back(i);
}
- // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
- if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
- (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
- isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
- return false;
-
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
if (IsIdentityPostShuffle)
PostShuffleMask.clear();
+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split), unless
+ // the shuffle can widen to shuffle entire lanes, which should still be quick.
+ if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
+ PostShuffleMask) &&
+ !canScaleShuffleElements(PostShuffleMask, 2))
+ return false;
+
// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.
if (!shouldUseHorizontalOp(LHS == RHS &&
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm2
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovapd %ymm2, %ymm1
+; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vmovapd %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f64: