From: Roman Lebedev
Date: Wed, 4 Jan 2023 17:43:42 +0000 (+0300)
Subject: [Codegen][X86] `LowerBUILD_VECTOR()`: improve lowering w/ multiple FREEZE-UNDEF ops
X-Git-Tag: upstream/17.0.6~22143
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e4b260efb2c8dca3035597d30ffd066ffb77daea;p=platform%2Fupstream%2Fllvm.git

[Codegen][X86] `LowerBUILD_VECTOR()`: improve lowering w/ multiple FREEZE-UNDEF ops

While we have great handling for UNDEF operands, FREEZE-UNDEF operands are
effectively normal operands. We are better off "interleaving" such
BUILD_VECTORs into a blend between a splat of FREEZE-UNDEF and the "thawed"
source BUILD_VECTOR, both of which are more natural for us to handle.

Refs. https://github.com/llvm/llvm-project/commit/f738ab9075f838dd4365adf3a92ca1acced114d7#r95017306
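
To illustrate the pattern this targets, here is a minimal, hypothetical IR
reproducer in the style of llvm/test/CodeGen/X86/freeze-vector.ll (the
function and value names below are made up and are not part of this patch).
Once the freeze of %vec is pushed into the BUILD_VECTOR operands, three of
the four lanes become FREEZE-UNDEF, so instead of a chain of insertions we
can build the defined lane normally, splat a single FREEZE-UNDEF, and blend
the two vectors:

  define void @frozen_buildvector_example(ptr %origin, ptr %dst) {
    ; Load a value and mask it so the lane is not provably constant.
    %val.src = load i32, ptr %origin
    %val = and i32 %val.src, 15
    ; Only lane 1 is defined; the other three lanes are poison/undef.
    %vec = insertelement <4 x i32> poison, i32 %val, i64 1
    ; Freezing the vector turns the undef lanes into FREEZE-UNDEF operands
    ; of the BUILD_VECTOR that reaches LowerBUILD_VECTOR().
    %vec.fr = freeze <4 x i32> %vec
    %masked = and <4 x i32> %vec.fr, <i32 7, i32 7, i32 7, i32 7>
    store <4 x i32> %masked, ptr %dst
    ret void
  }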
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c0aeed8..1babfa2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11137,6 +11137,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
+  MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
   unsigned NumElems = Op.getNumOperands();
 
   // Generate vectors for predicate vectors.
@@ -11151,6 +11152,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   unsigned EVTBits = EltVT.getSizeInBits();
   APInt UndefMask = APInt::getZero(NumElems);
+  APInt FrozenUndefMask = APInt::getZero(NumElems);
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt NonZeroMask = APInt::getZero(NumElems);
   bool IsAllConstants = true;
@@ -11162,6 +11164,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       UndefMask.setBit(i);
       continue;
     }
+    if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
+      FrozenUndefMask.setBit(i);
+      continue;
+    }
     Values.insert(Elt);
     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
       IsAllConstants = false;
@@ -11175,11 +11181,37 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // All undef vector. Return an UNDEF. All zero vectors were handled above.
-  if (NonZeroMask == 0) {
+  unsigned NumFrozenUndefElts = FrozenUndefMask.countPopulation();
+  if (NonZeroMask == 0 && NumFrozenUndefElts != NumElems) {
     assert(UndefMask.isAllOnes() && "Fully undef mask expected");
     return DAG.getUNDEF(VT);
   }
 
+  // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
+  // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
+  // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
+  // and blend the FREEZE-UNDEF operands back in.
+  // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
+  if (NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
+    SmallVector<int, 16> BlendMask(NumElems, -1);
+    SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
+    for (unsigned i = 0; i < NumElems; ++i) {
+      if (UndefMask[i]) {
+        BlendMask[i] = -1;
+        continue;
+      }
+      BlendMask[i] = i;
+      if (!FrozenUndefMask[i])
+        Elts[i] = Op.getOperand(i);
+      else
+        BlendMask[i] += NumElems;
+    }
+    SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
+    SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
+    SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
+    return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
+  }
+
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
 
   // If the upper elts of a ymm/zmm are undef/zero then we might be better off
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 1dec0a4..1ed5ab2 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -337,17 +337,17 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %edx
 ; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm1
-; X86-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X86-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
-; X86-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; X86-NEXT:    vmovd %edx, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; X86-NEXT:    vmovd %eax, %xmm2
+; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; X86-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; X86-NEXT:    vmovdqa %xmm1, (%ecx)
-; X86-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -356,16 +356,15 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    andl $15, %eax
 ; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
-; X64-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; X64-NEXT:    vmovdqa %xmm1, (%rdx)
-; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X64-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7]
+; X64-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; X64-NEXT:    vmovdqa %xmm2, (%rdx)
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X64-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
   %i0.src = load i32, ptr %origin0
@@ -392,15 +391,15 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
 ; X86-NEXT:    movl (%edx), %edx
 ; X86-NEXT:    andl $15, %edx
 ; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
-; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT:    vmovd %edx, %xmm1
+; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
+; X86-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
+; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
-; X86-NEXT:    vmovd %edx, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -409,15 +408,14 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    andl $15, %eax
 ; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
-; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vmovdqa %xmm0, (%rdx)
-; X64-NEXT:    vmovd %eax, %xmm0
 ; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
+; X64-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vmovdqa %xmm0, (%rdx)
+; X64-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
   %i0.src = load i32, ptr %origin0
@@ -443,30 +441,24 @@ define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, p
 ; X86-NEXT:    movl (%edx), %edx
 ; X86-NEXT:    andl $15, %edx
 ; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X86-NEXT:    vmovd %edx, %xmm1
 ; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
 ; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [7,7]
 ; X86-NEXT:    # xmm2 = mem[0,0]
 ; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
-; X86-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; X86-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_two_buildvectors_one_undef_elt:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    andl $15, %eax
-; X64-NEXT:    vmovq %rax, %xmm0
-; X64-NEXT:    vmovq %rax, %xmm1
-; X64-NEXT:    vpbroadcastq %xmm1, %xmm1
-; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7]
-; X64-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
+; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdx)
-; X64-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
   %i0.src = load i64, ptr %origin0