static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
+ bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
- SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- if (TargetMask[i] == SM_SentinelUndef)
- continue;
- if (Zeroable[i])
- TargetMask[i] = SM_SentinelZero;
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0, Size = Mask.size(); i != Size; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
}
- return canWidenShuffleElements(TargetMask, WidenedMask);
+ return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
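For context, the widening this helper feeds works pairwise: a mask of 2N lanes can be rebuilt over N lanes of twice the width only if every adjacent even/odd pair either addresses two consecutive elements of one source or consists entirely of undef/zero lanes. Below is a simplified standalone sketch of that check; kUndef, kZero, and widenMask are illustrative stand-ins for SM_SentinelUndef, SM_SentinelZero, and the real canWidenShuffleElements (which additionally allows an undef lane to pair with a real index), not LLVM's code.

// Simplified sketch of pairwise shuffle-mask widening; the stand-in
// sentinels mirror LLVM's SM_SentinelUndef (-1) and SM_SentinelZero (-2).
#include <cassert>
#include <cstdio>
#include <vector>

constexpr int kUndef = -1;
constexpr int kZero = -2;

// Returns true and fills Widened (size N) if Mask (size 2N) is widenable.
static bool widenMask(const std::vector<int> &Mask,
                      std::vector<int> &Widened) {
  assert(Mask.size() % 2 == 0 && "expected an even number of lanes");
  Widened.clear();
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    // Two undef lanes widen to one undef lane; a zero/undef pair widens
    // to one zero lane.
    if (Lo == kUndef && Hi == kUndef) { Widened.push_back(kUndef); continue; }
    if ((Lo == kZero || Lo == kUndef) && (Hi == kZero || Hi == kUndef)) {
      Widened.push_back(kZero);
      continue;
    }
    // Otherwise require an aligned even/odd pair of real indices.
    if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1) {
      Widened.push_back(Lo / 2);
      continue;
    }
    return false; // this pair cannot be expressed in the wider type
  }
  return true;
}

int main() {
  // The v4i32 mask <0,1,zero,zero> widens to the v2i64 mask <0,zero>.
  std::vector<int> Widened;
  bool OK = widenMask({0, 1, kZero, kZero}, Widened);
  printf("widenable=%d -> <%d,%d>\n", OK, Widened[0], Widened[1]);
  return 0;
}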
@@ ... @@
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
+ bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ ... @@
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
- // Create an alternative mask with info about zeroable elements.
- // Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
- if (V2IsZero) {
- assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
- for (int i = 0; i != NumElements; ++i)
- if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
- ZeroableMask[i] = SM_SentinelZero;
- }
-
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+ canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
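The block deleted above is not gone: it moves into canWidenShuffleElements behind the new V2IsZero flag, so every caller gets the same zeroable handling. A minimal sketch of that fold follows, under the same stand-in names as the earlier sketch (std::bitset<64> substituting for APInt, names illustrative rather than LLVM's). It shows the two properties the comment calls out: lanes are rewritten to zero only when V2 is known all-zero, and undef lanes are deliberately left undef.

// Sketch of the zeroable fold the patch relocates; kUndef/kZero as before.
#include <bitset>
#include <cassert>
#include <vector>

constexpr int kUndef = -1;
constexpr int kZero = -2;

static std::vector<int> buildZeroableMask(const std::vector<int> &Mask,
                                          const std::bitset<64> &Zeroable,
                                          bool V2IsZero) {
  std::vector<int> Out(Mask);
  if (V2IsZero) {
    // Mirrors the patch's sanity check: an all-zero V2 that is actually
    // referenced must contribute at least one zeroable lane.
    assert(Zeroable.any() && "V2's non-undef elements are used?!");
    for (size_t i = 0; i < Mask.size(); ++i)
      if (Mask[i] != kUndef && Zeroable[i]) // undef lanes stay undef
        Out[i] = kZero;
  }
  return Out;
}

int main() {
  // Lanes 2 and 3 read from an all-zero V2, so they fold to kZero.
  std::bitset<64> Zeroable;
  Zeroable.set(2);
  Zeroable.set(3);
  auto Out = buildZeroableMask({0, 1, 6, 7}, Zeroable, /*V2IsZero=*/true);
  return (Out[2] == kZero && Out[3] == kZero) ? 0 : 1;
}

With lanes 2 and 3 folded to zero, the pairwise check sketched earlier can widen <0,1,kZero,kZero> even though the original mask <0,1,6,7> is not widenable on its own.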
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
+
+@v2_0 = global <2 x i32> zeroinitializer, align 8
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %v8_0 = alloca <8 x i32>, align 32
+ %v8_0.0.v8_0.0..sroa_cast = bitcast <8 x i32>* %v8_0 to i8*
+ %0 = load <2 x i32>, <2 x i32>* @v2_0, align 8
+ %shuffle = shufflevector <2 x i32> %0, <2 x i32> <i32 -1, i32 -1>, <8 x i32> <i32 1, i32 3, i32 0, i32 0, i32 3, i32 3, i32 2, i32 2>
+ store volatile <8 x i32> %shuffle, <8 x i32>* %v8_0, align 32
+ ret void
+}