static SDValue lowerShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltBits = VT.getScalarSizeInBits();
if (isSoftFP16(EltVT, Subtarget))
return SDValue();
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
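+ // V1 counts as constant if it is (a bitcast of) a constant-pool load.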
+ bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
  if (i != V2Index && !Zeroable[i]) {
    IsV1Zeroable = false;
    break;
  }
+ // Bail if a non-zero V1 isn't used in place.
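+ // (i.e. Mask must be a no-op apart from V2Index: e.g. for <4 x i32>,
+ // Mask <0,1,4,3> with V2Index == 2 leaves V1Mask <0,1,-1,3>, a no-op).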
+ if (!IsV1Zeroable) {
+ SmallVector<int, 8> V1Mask(Mask);
+ V1Mask[V2Index] = -1;
+ if (!isNoopShuffleMask(V1Mask))
+ return SDValue();
+ }
+
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
// Using zext to expand a narrow element won't work for non-zero
- // insertions.
- if (!IsV1Zeroable)
+ // insertions. But we can use a masked constant vector if we're
+ // inserting V2 into the bottom of V1.
+ if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+
+ // If we're inserting into a constant, mask off the inserted index
+ // and OR with the zero-extended scalar.
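+ // E.g. inserting i8 %x into lane 0 of constant <C0,C1,...,C15> becomes
+ // (<C0,...,C15> & <0,-1,...,-1>) | vzext_movl(scalar_to_vector(zext %x)).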
+ if (!IsV1Zeroable) {
+ SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
+ Bits[V2Index] = APInt::getZero(EltBits);
+ SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
+ V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+ V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+ }
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
           EltVT == MVT::i16) {
  // Either not inserting from the low element of the input or the input
  // element size is too small to use VZEXT_MOVL to clear the high bits.
  return SDValue();
}

if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
- // this. We can't support integer vectors or non-zero targets cheaply, and
- // the V1 elements can't be permuted in any way.
+ // this. We can't support integer vectors or non-zero targets cheaply.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
- SmallVector<int, 8> V1Mask(Mask);
- V1Mask[V2Index] = -1;
- if (!isNoopShuffleMask(V1Mask))
- return SDValue();
if (!VT.is128BitVector())
return SDValue();
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
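// E.g. inserting into lane 2 of a zeroable v4f32 uses shuffle mask
// <1,1,0,1> (lane 1 of V2 is known zero after VZEXT_MOVL), while lane 5
// of v16i8 becomes a 5-byte VSHLDQ (PSLLDQ) instead.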
- if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+ if (VT.isFloatingPoint() || NumElts <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
- V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
- DAG.getTargetConstant(
- V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
+ V2 = DAG.getNode(
+ X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+ DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
define <16 x i8> @elt0_v16i8(i8 %x) {
; X86-SSE2-LABEL: elt0_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: elt0_v16i8:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %edi, %xmm0
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: movzbl %dil, %eax
+; X64-SSE2-NEXT: movd %eax, %xmm0
; X64-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
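+; NOTE: the LCPI constant in the new sequences is the original constant
+; vector with lane 0 cleared (the AND has been folded into it), so a single
+; por materializes the result and movzbl replaces the old mask load.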