From a2efe82b8160e9e72b94899cdcee21c7a6ec3797 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 29 Sep 2018 18:15:26 +0000 Subject: [PATCH] [X86] SimplifyDemandedVectorEltsForTargetNode - remove identity target shuffles before simplifying inputs By removing demanded target shuffles that simplify to zero/undef/identity before simplifying their inputs we improve chances of further simplification, as only the immediate parent user of the combined node is added back to the work list - this still doesn't help us if it's passed through other ops though (bitcasts, ...). llvm-svn: 343390 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 36 ++++++++++++++++----------------- llvm/test/CodeGen/X86/dagcombine-cse.ll | 4 +--- llvm/test/CodeGen/X86/oddshuffles.ll | 4 ++-- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6a131cd..356c445 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31732,25 +31732,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( [VT](SDValue V) { return VT != V.getValueType(); })) return false; - // Attempt to simplify inputs. - int NumSrcs = OpInputs.size(); - for (int Src = 0; Src != NumSrcs; ++Src) { - int Lo = Src * NumElts; - APInt SrcElts = APInt::getNullValue(NumElts); - for (int i = 0; i != NumElts; ++i) - if (DemandedElts[i]) { - int M = OpMask[i] - Lo; - if (0 <= M && M < NumElts) - SrcElts.setBit(M); - } - - APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, - TLO, Depth + 1)) - return true; - } - // Check if shuffle mask can be simplified to undef/zero/identity. 
+ int NumSrcs = OpInputs.size(); for (int i = 0; i != NumElts; ++i) if (!DemandedElts[i]) OpMask[i] = SM_SentinelUndef; @@ -31768,6 +31751,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) return TLO.CombineTo(Op, OpInputs[Src]); + // Attempt to simplify inputs. + for (int Src = 0; Src != NumSrcs; ++Src) { + int Lo = Src * NumElts; + APInt SrcElts = APInt::getNullValue(NumElts); + for (int i = 0; i != NumElts; ++i) + if (DemandedElts[i]) { + int M = OpMask[i] - Lo; + if (0 <= M && M < NumElts) + SrcElts.setBit(M); + } + + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, + TLO, Depth + 1)) + return true; + } + // Extract known zero/undef elements. // TODO - Propagate input undef/zero elts. for (int i = 0; i != NumElts; ++i) { diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll index 778040a..bf1dab3 100644 --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -9,9 +9,7 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: movl (%eax,%ecx), %eax ; X32-NEXT: retl ; ; X64-LABEL: t: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 3e362fe..def822a 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1280,7 +1280,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[0,1],xmm5[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0] ; SSE2-NEXT: movaps %xmm8, %xmm5 @@ -1288,7 +1288,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,0] ; SSE2-NEXT: movdqa %xmm9, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -- 2.7.4