SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
+ SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
return St1;
}
+/// Convert a disguised subvector insertion into a shuffle:
+/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+/// bitcast(shuffle (bitcast V), (extended X), Mask)
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
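+/// For example, inserting a bitcasted v2i16 at element 2 of a v4i32:
+///   insert_vector_elt v4i32 V, (i32 bitcast (v2i16 X)), 2 -->
+///   bitcast (shuffle (v8i16 bitcast V), (v8i16 concat X, undef, undef, undef),
+///                    {0,1,2,3,8,9,6,7})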
+SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+ SDValue InsertVal = N->getOperand(1);
+ if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+ !InsertVal.getOperand(0).getValueType().isVector())
+ return SDValue();
+
+ SDValue SubVec = InsertVal.getOperand(0);
+ SDValue DestVec = N->getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+ EVT VT = DestVec.getValueType();
+ unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+ unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+ unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+ // Step 1: Create a shuffle mask that implements this insert operation. The
+ // vector that we are inserting into will be operand 0 of the shuffle, so
+ // those elements are just 'i'. The inserted subvector is in the first
+ // positions of operand 1 of the shuffle. Example:
+ // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+ SmallVector<int, 16> Mask(NumMaskVals);
+ for (unsigned i = 0; i != NumMaskVals; ++i) {
+ if (i / NumSrcElts == InsIndex)
+ Mask[i] = (i % NumSrcElts) + NumMaskVals;
+ else
+ Mask[i] = i;
+ }
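+ // For the example above (NumSrcElts == 2, InsIndex == 2, NumMaskVals == 8),
+ // only i == 4 and i == 5 satisfy i / NumSrcElts == InsIndex, so those two
+ // lanes get mask values 8 and 9 (elements 0 and 1 of shuffle operand 1, the
+ // padded subvector); every other lane keeps the destination vector element.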
+
+ // Bail out if the target cannot handle the shuffle we want to create.
+ EVT SubVecEltVT = SubVecVT.getVectorElementType();
+ EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+ if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
+ return SDValue();
+
+ // Step 2: Create a wide vector from the inserted source vector by appending
+ // undefined elements. This is the same size as our destination vector.
+ SDLoc DL(N);
+ SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+ ConcatOps[0] = SubVec;
+ SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
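+ // Continuing the example: concat_vectors (v2i16 X), undef, undef, undef
+ // yields a v8i16 value whose low NumSrcElts lanes hold X and whose
+ // remaining lanes are undefined.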
+
+ // Step 3: Shuffle in the padded subvector.
+ SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+ SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+ AddToWorklist(PaddedSubV.getNode());
+ AddToWorklist(DestVecBC.getNode());
+ AddToWorklist(Shuf.getNode());
+ return DAG.getBitcast(VT, Shuf);
+}
+
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
SDValue EltNo = N->getOperand(2);

// Remove redundant insertions:
// (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
    InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
  return InVec;
- // Check that we know which element is being inserted
- if (!isa<ConstantSDNode>(EltNo))
+ // We must know which element is being inserted for folds below here.
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+ if (!IndexC)
return SDValue();
- unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ unsigned Elt = IndexC->getZExtValue();
+
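+ // If the inserted value is a bitcast of a smaller vector, try to turn the
+ // whole insert into a vector shuffle (see combineInsertEltToShuffle above).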
+ if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+ return Shuf;
// Canonicalize insert_vector_elt dag nodes.
// Example:
define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
; X32_AVX256-LABEL: insert_subvector_256:
; X32_AVX256: # BB#0:
-; X32_AVX256-NEXT: pushl %eax
; X32_AVX256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX256-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX256-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX256-NEXT: popl %eax
+; X32_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X32_AVX256-NEXT: retl
;
; X64_AVX256-LABEL: insert_subvector_256:
; X64_AVX256: # BB#0:
; X64_AVX256-NEXT: vmovd %edi, %xmm1
; X64_AVX256-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX256-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X64_AVX256-NEXT: retq
;
; X32_AVX512-LABEL: insert_subvector_256:
; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %eax
; X32_AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX512-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX512-NEXT: popl %eax
+; X32_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_256:
; X64_AVX512: # BB#0:
; X64_AVX512-NEXT: vmovd %edi, %xmm1
; X64_AVX512-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; X64_AVX512-NEXT: retq
%ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
%ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
;
; X32_AVX512-LABEL: insert_subvector_512:
; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
-; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm1, (%esp)
-; X32_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X32_AVX512-NEXT: vpinsrd $0, (%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
+; X32_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_512:
; X64_AVX512: # BB#0:
; X64_AVX512-NEXT: vmovd %edi, %xmm1
; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovq %xmm1, %rax
-; X64_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64_AVX512-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; X64_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
+; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X64_AVX512-NEXT: retq
%ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
%ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
;
; X32_AVX512-LABEL: insert_subvector_into_undef:
; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm0, (%esp)
-; X32_AVX512-NEXT: movl (%esp), %eax
-; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX512-NEXT: vmovd %eax, %xmm0
-; X32_AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vbroadcastsd %xmm0, %zmm0
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: insert_subvector_into_undef: