// NOTE(review): patch fragment from X86 BUILD_VECTOR lowering — the enclosing
// function and the per-element loop header lie outside this excerpt. Diff
// markers ('+'/'-') are preserved verbatim.
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Element type of the operands themselves; may differ from EltVT (BUILD_VECTOR
// operands can be wider than the vector element type) — TODO confirm against
// the surrounding code.
+ MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
unsigned EVTBits = EltVT.getSizeInBits();
APInt UndefMask = APInt::getZero(NumElems);
// Tracks elements that are FREEZE of UNDEF — these are distinct from plain
// UNDEF: a frozen undef must materialize as some fixed (if arbitrary) value.
+ APInt FrozenUndefMask = APInt::getZero(NumElems);
APInt ZeroMask = APInt::getZero(NumElems);
APInt NonZeroMask = APInt::getZero(NumElems);
bool IsAllConstants = true;
UndefMask.setBit(i);
continue;
}
// Classify FREEZE(UNDEF) elements separately so they are neither treated as
// plain undef nor as ordinary values below.
+ if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
+ FrozenUndefMask.setBit(i);
+ continue;
+ }
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
- if (NonZeroMask == 0) {
+ unsigned NumFrozenUndefElts = FrozenUndefMask.countPopulation();
// The all-frozen-undef case is excluded: folding it to UNDEF would discard
// the FREEZE and reintroduce undef semantics — presumably why the condition
// was tightened here.
+ if (NonZeroMask == 0 && NumFrozenUndefElts != NumElems) {
assert(UndefMask.isAllOnes() && "Fully undef mask expected");
return DAG.getUNDEF(VT);
}
+ // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
+ // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
+ // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
+ // and blend the FREEZE-UNDEF operands back in.
+ // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
+ if (NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
+ SmallVector<int, 16> BlendMask(NumElems, -1);
+ SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (UndefMask[i]) {
// Genuinely-undef lanes stay undef in the shuffle (-1 = don't care).
+ BlendMask[i] = -1;
+ continue;
+ }
+ BlendMask[i] = i;
+ if (!FrozenUndefMask[i])
+ Elts[i] = Op.getOperand(i);
+ else
// Indices >= NumElems select from the second shuffle operand, i.e. the
// FREEZE-UNDEF splat built below.
+ BlendMask[i] += NumElems;
+ }
+ SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
// One shared frozen-undef element splatted across the second vector; the
// blend picks it for every frozen-undef lane.
+ SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
+ SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
+ return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
+ }
+
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
// If the upper elts of a ymm/zmm are undef/zero then we might be better off
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm1
-; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
-; X86-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X86-NEXT: vmovd %edx, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; X86-NEXT: vmovd %eax, %xmm2
+; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; X86-NEXT: vpand %xmm3, %xmm1, %xmm1
; X86-NEXT: vmovdqa %xmm1, (%ecx)
-; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: andl $15, %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X64-NEXT: vmovdqa %xmm1, (%rdx)
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7]
+; X64-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X64-NEXT: vmovdqa %xmm2, (%rdx)
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X64-NEXT: vpand %xmm3, %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i32, ptr %origin0
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vmovd %edx, %xmm1
+; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
+; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
-; X86-NEXT: vmovd %edx, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: andl $15, %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovdqa %xmm0, (%rdx)
-; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
+; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vmovdqa %xmm0, (%rdx)
+; X64-NEXT: vpand %xmm2, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i32, ptr %origin0
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X86-NEXT: vmovd %edx, %xmm1
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X86-NEXT: vmovddup {{.*#+}} xmm2 = [7,7]
; X86-NEXT: # xmm2 = mem[0,0]
; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
-; X86-NEXT: vpand %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_two_buildvectors_one_undef_elt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: andl $15, %eax
-; X64-NEXT: vmovq %rax, %xmm0
-; X64-NEXT: vmovq %rax, %xmm1
-; X64-NEXT: vpbroadcastq %xmm1, %xmm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7]
-; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdx)
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i64, ptr %origin0