From e744f513c4d75c346f2daeb6dfebbd15ffa6bae7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 19 Mar 2019 17:23:25 +0000
Subject: [PATCH] [X86][SSE] SimplifyDemandedVectorEltsForTargetNode - handle repeated shift amounts

If a value with multiple uses is only ever used for SSE shift amounts then we
know that only the bottom 64-bits are needed.

llvm-svn: 356483
---
 llvm/lib/Target/X86/X86ISelLowering.cpp           | 12 ++++-
 llvm/test/CodeGen/X86/vector-rotate-512.ll        | 54 +++++++++++------------
 llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll | 22 ++++-----
 3 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fc25191..8ae505f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33041,11 +33041,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     SDValue Amt = Op.getOperand(1);
     MVT AmtVT = Amt.getSimpleValueType();
     assert(AmtVT.is128BitVector() && "Unexpected value type");
+
+    // If the shift amount is only ever reused as an SSE shift amount then we
+    // know that only the bottom 64-bits are needed.
+    bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
+      unsigned Opc = Use->getOpcode();
+      return (Opc == X86ISD::VSHL || Opc == X86ISD::VSRL ||
+              Opc == X86ISD::VSRA) &&
+             Use->getOperand(0) != Amt;
+    });
+
     APInt AmtUndef, AmtZero;
     unsigned NumAmtElts = AmtVT.getVectorNumElements();
     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
-                                   Depth + 1))
+                                   Depth + 1, AssumeSingleUse))
       return true;
     LLVM_FALLTHROUGH;
   }
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 1d177c1..68249c2 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -425,42 +425,40 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm4, %zmm2
 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLBW-NEXT: vpandq %zmm1, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm4, %zmm2
 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %splat8 = sub <64 x i8> , %splat
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index cc4669b..fa4f4ea 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1057,21 +1057,21 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; X32-SSE-NEXT: psrad $31, %xmm0
 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT: xorps %xmm4, %xmm4
-; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X32-SSE-NEXT: movdqa %xmm4, %xmm0
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-SSE-NEXT: xorps %xmm5, %xmm5
+; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
+; X32-SSE-NEXT: psrlq %xmm5, %xmm4
+; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
+; X32-SSE-NEXT: psrlq %xmm5, %xmm0
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm2
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm1, %xmm0
-; X32-SSE-NEXT: psubq %xmm1, %xmm0
+; X32-SSE-NEXT: xorpd %xmm4, %xmm0
+; X32-SSE-NEXT: psubq %xmm4, %xmm0
 ; X32-SSE-NEXT: retl
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
   %shift = ashr <2 x i32> %a, %splat
-- 
2.7.4
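
As a rough illustration of the reasoning behind the AssumeSingleUse check in the patch above, here is a small standalone C++ sketch. It is not LLVM code: the Node/Opcode types and the onlyUsedAsShiftAmount helper are simplified stand-ins invented for this example. It models the idea that a shift-amount value with multiple uses may still be narrowed to its bottom 64-bits, provided every user is an SSE-style shift that consumes it only as the amount operand (operand 1) and never as the value being shifted (operand 0).

    // Standalone sketch of the "all uses are shift amounts" predicate.
    // Node, Opcode and the operand layout below are simplified stand-ins
    // for the SelectionDAG types used in the real patch.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    enum Opcode { VSHL, VSRL, VSRA, ADD };

    struct Node {
      Opcode Opc;
      std::vector<const Node *> Operands; // Operands[1] is the shift amount.
    };

    // Returns true if every user is a shift that reads Amt only as its
    // amount operand, so only Amt's low 64 bits are ever demanded.
    static bool onlyUsedAsShiftAmount(const Node &Amt,
                                      const std::vector<const Node *> &Uses) {
      return std::all_of(Uses.begin(), Uses.end(), [&Amt](const Node *Use) {
        bool IsShift = Use->Opc == VSHL || Use->Opc == VSRL || Use->Opc == VSRA;
        return IsShift && Use->Operands[0] != &Amt;
      });
    }

    int main() {
      Node Amt{ADD, {}};               // some shift-amount computation
      Node Shl{VSHL, {nullptr, &Amt}}; // uses Amt only as the amount operand
      Node Srl{VSRL, {nullptr, &Amt}};
      Node Bad{ADD, {&Amt, nullptr}};  // a non-shift use blocks the assumption

      std::printf("shift-only uses: %d\n",
                  onlyUsedAsShiftAmount(Amt, {&Shl, &Srl}) ? 1 : 0); // 1
      std::printf("with extra use:  %d\n",
                  onlyUsedAsShiftAmount(Amt, {&Shl, &Bad}) ? 1 : 0); // 0
      return 0;
    }

In the patch itself this predicate feeds the AssumeSingleUse parameter of SimplifyDemandedVectorElts, letting the demanded-elements simplification fire even though Amt has multiple uses. The operand-0 check matters because a shift whose value operand is the amount itself would still demand the upper elements.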