case X86ISD::GlobalBaseReg:
return getGlobalBaseReg();
+ case X86ISD::SHRUNKBLEND: {
+ // SHRUNKBLEND selects like a regular VSELECT.
+ SDValue VSelect = CurDAG->getNode(
+ ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2));
+ ReplaceUses(SDValue(Node, 0), VSelect);
+ SelectCode(VSelect.getNode());
+ // We already called ReplaceUses.
+ return nullptr;
+ }
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
// build_vector of constants. This will be taken care in a later
// condition.
(TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
- VT != MVT::v8i16)) {
+ VT != MVT::v8i16) &&
+ // Don't optimize vector of constants. Those are handled by
+ // the generic code and all the bits must be properly set for
+ // the generic optimizer.
+ !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
return SDValue();
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are set
- // properly.
- for (SDNode::use_iterator I = Cond->use_begin(),
- E = Cond->use_end(); I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
- return SDValue();
-
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
- TLO) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
- !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
- DCI.CommitTargetLoweringOpt(TLO);
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+ TLO)) {
+ // If we changed the computation somewhere in the DAG, this change
+ // will affect all users of Cond.
+ // Make sure it is fine and update all the nodes so that we do not
+ // use the generic VSELECT anymore. Otherwise, we may perform
+ // wrong optimizations as we messed up with the actual expectation
+ // for the vector boolean values.
+ if (Cond != TLO.Old) {
+ // Check all uses of that condition operand to check whether it will be
+ // consumed by non-BLEND instructions, which may depend on all bits are
+ // set properly.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ if (I->getOpcode() != ISD::VSELECT)
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ return SDValue();
+
+ // Update all the users of the condition, before committing the change,
+ // so that the VSELECT optimizations that expect the correct vector
+ // boolean value will not be triggered.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(*I, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+ Cond, I->getOperand(1), I->getOperand(2)));
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue();
+ }
+ // At this point, only Cond is changed. Change the condition
+ // just for N to keep the opportunity to optimize all other
+ // users their own way.
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(N, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+ TLO.New, N->getOperand(1), N->getOperand(2)));
+ return SDValue();
+ }
}
// We should generate an X86ISD::BLENDI from a vselect if its argument
// Iff we find this pattern and the build_vectors are built from
// constants, we translate the vselect into a shuffle_vector that we
// know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
if (Shuffle.getNode())
return Shuffle;
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
- case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::SELECT:
+ case X86ISD::SHRUNKBLEND:
+ return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
/// BLENDI - Blend where the selector is an immediate.
BLENDI,
+ /// SHRUNKBLEND - Blend where the condition has been shrunk.
+ /// This is used to emphasize that the condition mask is
+ /// no more valid for generic VSELECT optimizations.
+ SHRUNKBLEND,
+
/// ADDSUB - Combined add and sub on an FP vector.
ADDSUB,
; SSE41-LABEL: vsel_double8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm5[0,1]
; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT: blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_double8:
;
; SSE41-LABEL: vsel_i648:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm5[0,1]
-; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT: blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_i648:
store <4 x i16> %predphi42, <4 x i16>* %b, align 8
ret void
}
+
+; Improve code coverage.
+;
+; When shrinking the condition used into the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd [[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+ %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+ %tmp1888 = load double** %arrayidx1928, align 8
+ %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+ %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+ store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+ ret void
+}
+
+; For this test, we used to optimized the conditional mask for the blend, i.e.,
+; we shrunk some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned out
+; to be optimized into a and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+;
+; <rdar://problem/18819506>
+
+; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;
+; CHECK-LABEL: test3:
+; Compute the original mask.
+; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bit of the mask.
+; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}}
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
+ %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
+ %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+ %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+ %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+
+ store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+ store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+ ret void
+}