[DAGCombiner] convert insertelement of bitcasted vector into shuffle

author Sanjay Patel <spatel@rotateright.com>

Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)
author Sanjay Patel <spatel@rotateright.com>
Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index fef1c8f749cef4d965cae7a85a4458edd29b08d5..50a29da533f1ec4352c0b51953399ad8f03f5845 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -415,6 +415,7 @@ namespace {
      SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
      SDValue CombineExtLoad(SDNode *N);
      SDValue combineRepeatedFPDivisors(SDNode *N);
+    SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
      SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
      SDValue BuildSDIV(SDNode *N);
      SDValue BuildSDIVPow2(SDNode *N);
@@ -13747,6 +13748,60 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
    return St1;
  }
  
+/// Convert a disguised subvector insertion into a shuffle:
+/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+/// bitcast(shuffle (bitcast V), (extended X), Mask)
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
+SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+  SDValue InsertVal = N->getOperand(1);
+  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+      !InsertVal.getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  SDValue SubVec = InsertVal.getOperand(0);
+  SDValue DestVec = N->getOperand(0);
+  EVT SubVecVT = SubVec.getValueType();
+  EVT VT = DestVec.getValueType();
+  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+  unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+  // Step 1: Create a shuffle mask that implements this insert operation. The
+  // vector that we are inserting into will be operand 0 of the shuffle, so
+  // those elements are just 'i'. The inserted subvector is in the first
+  // positions of operand 1 of the shuffle. Example:
+  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+  SmallVector<int, 16> Mask(NumMaskVals);
+  for (unsigned i = 0; i != NumMaskVals; ++i) {
+    if (i / NumSrcElts == InsIndex)
+      Mask[i] = (i % NumSrcElts) + NumMaskVals;
+    else
+      Mask[i] = i;
+  }
+
+  // Bail out if the target can not handle the shuffle we want to create.
+  EVT SubVecEltVT = SubVecVT.getVectorElementType();
+  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
+    return SDValue();
+
+  // Step 2: Create a wide vector from the inserted source vector by appending
+  // undefined elements. This is the same size as our destination vector.
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+  ConcatOps[0] = SubVec;
+  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
+
+  // Step 3: Shuffle in the padded subvector.
+  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+  AddToWorklist(PaddedSubV.getNode());
+  AddToWorklist(DestVecBC.getNode());
+  AddToWorklist(Shuf.getNode());
+  return DAG.getBitcast(VT, Shuf);
+}
+
  SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
@@ -13765,10 +13820,14 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
        InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
      return InVec;
  
-  // Check that we know which element is being inserted
-  if (!isa<ConstantSDNode>(EltNo))
+  // We must know which element is being inserted for folds below here.
+  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+  if (!IndexC)
      return SDValue();
-  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  unsigned Elt = IndexC->getZExtValue();
+
+  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+    return Shuf;
  
    // Canonicalize insert_vector_elt dag nodes.
    // Example:
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

index 2585676e1bd290284e9b0de116b04c89884aa7af..1a1a20b3d1f656353973b25e4b7928b577e12f28 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -140,7 +140,7 @@ define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
  
  define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
  ; CHECK-LABEL: ins1f2:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
    %tmp3 = extractelement <1 x double> %tmp1, i32 0
    %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
    ret <2 x double> %tmp4
diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll

index 2549af813016c92447e3a6364c42f76a6a07c7a2..fb01e18cd71597fbdd65a91866ba321da422e287 100644 (file)
--- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll
+++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
@@ -7,42 +7,34 @@
  define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
  ; X32_AVX256-LABEL: insert_subvector_256:
  ; X32_AVX256:       # BB#0:
-; X32_AVX256-NEXT:    pushl %eax
  ; X32_AVX256-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
  ; X32_AVX256-NEXT:    vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX256-NEXT:    vmovd %xmm1, (%esp)
-; X32_AVX256-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX256-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX256-NEXT:    popl %eax
+; X32_AVX256-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X32_AVX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
  ; X32_AVX256-NEXT:    retl
  ;
  ; X64_AVX256-LABEL: insert_subvector_256:
  ; X64_AVX256:       # BB#0:
  ; X64_AVX256-NEXT:    vmovd %edi, %xmm1
  ; X64_AVX256-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX256-NEXT:    vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX256-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX256-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX256-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X64_AVX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
  ; X64_AVX256-NEXT:    retq
  ;
  ; X32_AVX512-LABEL: insert_subvector_256:
  ; X32_AVX512:       # BB#0:
-; X32_AVX512-NEXT:    pushl %eax
  ; X32_AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
  ; X32_AVX512-NEXT:    vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT:    vmovd %xmm1, (%esp)
-; X32_AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX512-NEXT:    popl %eax
+; X32_AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X32_AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
  ; X32_AVX512-NEXT:    retl
  ;
  ; X64_AVX512-LABEL: insert_subvector_256:
  ; X64_AVX512:       # BB#0:
  ; X64_AVX512-NEXT:    vmovd %edi, %xmm1
  ; X64_AVX512-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX512-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X64_AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
  ; X64_AVX512-NEXT:    retq
    %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
    %ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
@@ -80,28 +72,17 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind
  ;
  ; X32_AVX512-LABEL: insert_subvector_512:
  ; X32_AVX512:       # BB#0:
-; X32_AVX512-NEXT:    pushl %ebp
-; X32_AVX512-NEXT:    movl %esp, %ebp
-; X32_AVX512-NEXT:    andl $-8, %esp
-; X32_AVX512-NEXT:    subl $8, %esp
-; X32_AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32_AVX512-NEXT:    vmovlps %xmm1, (%esp)
-; X32_AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X32_AVX512-NEXT:    vpinsrd $0, (%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
-; X32_AVX512-NEXT:    movl %ebp, %esp
-; X32_AVX512-NEXT:    popl %ebp
+; X32_AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32_AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
+; X32_AVX512-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
  ; X32_AVX512-NEXT:    retl
  ;
  ; X64_AVX512-LABEL: insert_subvector_512:
  ; X64_AVX512:       # BB#0:
  ; X64_AVX512-NEXT:    vmovd %edi, %xmm1
  ; X64_AVX512-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vmovq %xmm1, %rax
-; X64_AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64_AVX512-NEXT:    vpinsrq $0, %rax, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; X64_AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
+; X64_AVX512-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
  ; X64_AVX512-NEXT:    retq
    %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
    %ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
@@ -144,22 +125,8 @@ define <8 x i64> @insert_subvector_into_undef(i32 %x0, i32 %x1) nounwind {
  ;
  ; X32_AVX512-LABEL: insert_subvector_into_undef:
  ; X32_AVX512:       # BB#0:
-; X32_AVX512-NEXT:    pushl %ebp
-; X32_AVX512-NEXT:    movl %esp, %ebp
-; X32_AVX512-NEXT:    andl $-8, %esp
-; X32_AVX512-NEXT:    subl $8, %esp
  ; X32_AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32_AVX512-NEXT:    vmovlps %xmm0, (%esp)
-; X32_AVX512-NEXT:    movl (%esp), %eax
-; X32_AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX512-NEXT:    vmovd %eax, %xmm0
-; X32_AVX512-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX512-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; X32_AVX512-NEXT:    movl %ebp, %esp
-; X32_AVX512-NEXT:    popl %ebp
+; X32_AVX512-NEXT:    vbroadcastsd %xmm0, %zmm0
  ; X32_AVX512-NEXT:    retl
  ;
  ; X64_AVX512-LABEL: insert_subvector_into_undef:
author	Sanjay Patel <spatel@rotateright.com>
	Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Wed, 11 Oct 2017 14:12:16 +0000 (14:12 +0000)
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll		patch \| blob \| history
llvm/test/CodeGen/X86/insertelement-shuffle.ll		patch \| blob \| history