From 644d56b432bf6f50faa9638ec1b75e9d7ed3d6d4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 9 Feb 2020 12:25:19 +0000
Subject: [PATCH] [X86] Recognise ROTLI/ROTRI rotations as faux shuffles

Allows us to combine rotations with shuffles.

One of many things necessary to fix PR44379 (lowering shuffles to rotations)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp                 | 18 ++++++++++++++++++
 .../CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll  |  6 ++----
 llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll   |  6 ++----
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 27363a9..d209b1e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7443,6 +7443,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
     return true;
   }
+  case X86ISD::VROTLI:
+  case X86ISD::VROTRI: {
+    // We can only decode 'whole byte' bit rotates as shuffles.
+    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
+    if ((RotateVal % 8) != 0)
+      return false;
+    Ops.push_back(N.getOperand(0));
+    int NumBytesPerElt = NumBitsPerElt / 8;
+    int Offset = RotateVal / 8;
+    Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
+    for (int i = 0; i != NumElts; ++i) {
+      int BaseIdx = i * NumBytesPerElt;
+      for (int j = 0; j != NumBytesPerElt; ++j) {
+        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
+      }
+    }
+    return true;
+  }
   case X86ISD::VBROADCAST: {
     SDValue Src = N.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index fd765fe..e42691d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -79,8 +79,7 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpcklwd(<16 x i16> %a
 define <16 x i8> @combine_shuffle_vrotri_v2i64(<2 x i64> %a0) {
 ; CHECK-LABEL: combine_shuffle_vrotri_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vprorq $48, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13,12,11,10,9,8,15,14,5,4,3,2,1,0,7,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a0, <2 x i64> %a0, <2 x i64> <i64 48, i64 48>)
   %2 = bitcast <2 x i64> %1 to <16 x i8>
@@ -92,8 +91,7 @@ declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 define <16 x i8> @combine_shuffle_vrotli_v4i32(<4 x i32> %a0) {
 ; CHECK-LABEL: combine_shuffle_vrotli_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vprold $8, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> %a0, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
   %2 = bitcast <4 x i32> %1 to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 7320036..6460c52 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -255,8 +255,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
 define <16 x i8> @combine_shuffle_proti_v2i64(<2 x i64> %a0) {
 ; CHECK-LABEL: combine_shuffle_proti_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vprotq $16, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13,12,11,10,9,8,15,14,5,4,3,2,1,0,7,6]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a0, <2 x i64> %a0, <2 x i64> <i64 48, i64 48>)
   %2 = bitcast <2 x i64> %1 to <16 x i8>
@@ -268,8 +267,7 @@ declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 define <16 x i8> @combine_shuffle_proti_v4i32(<4 x i32> %a0) {
 ; CHECK-LABEL: combine_shuffle_proti_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vprotd $8, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> %a0, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
   %2 = bitcast <4 x i32> %1 to <16 x i8>
-- 
2.7.4
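
Background note (not part of the patch): the new VROTLI/VROTRI case turns a whole-byte rotation into a per-element byte shuffle mask, which getFauxShuffleMask's callers can then merge with surrounding shuffles; that is why the updated tests collapse a rotate plus a byte-reverse into a single vpshufb. The following is a minimal standalone C++ sketch that replays the mask computation from the hunk above for the v2i64 rotate-right-by-48 case in combine_shuffle_vrotri_v2i64. The helper name getRotateAsByteShuffle is illustrative only and does not exist in LLVM.

// Standalone sketch: recompute the faux shuffle mask that the new
// VROTLI/VROTRI case produces for a whole-byte element rotation.
#include <cstdint>
#include <cstdio>
#include <vector>

// RotateVal is the rotate amount in bits, already reduced modulo the
// element width (mirroring the .urem(NumBitsPerElt) in the patch).
// IsVROTLI selects a left rotate; otherwise a right rotate is assumed.
static bool getRotateAsByteShuffle(unsigned NumBitsPerElt, unsigned NumElts,
                                   uint64_t RotateVal, bool IsVROTLI,
                                   std::vector<int> &Mask) {
  // Only 'whole byte' bit rotates can be expressed as byte shuffles.
  if ((RotateVal % 8) != 0)
    return false;
  int NumBytesPerElt = NumBitsPerElt / 8;
  int Offset = RotateVal / 8;
  // A left rotate by K bytes permutes bytes the same way as a right
  // rotate by (NumBytesPerElt - K) bytes.
  Offset = IsVROTLI ? NumBytesPerElt - Offset : Offset;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BaseIdx = i * NumBytesPerElt;
    for (int j = 0; j != NumBytesPerElt; ++j)
      Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
  }
  return true;
}

int main() {
  std::vector<int> Mask;
  // v2i64 rotated right by 48 bits, as in combine_shuffle_vrotri_v2i64.
  if (getRotateAsByteShuffle(/*NumBitsPerElt=*/64, /*NumElts=*/2,
                             /*RotateVal=*/48, /*IsVROTLI=*/false, Mask)) {
    for (int M : Mask)
      printf("%d ", M);
    printf("\n"); // prints: 6 7 0 1 2 3 4 5 14 15 8 9 10 11 12 13
  }
  return 0;
}

On little-endian 64-bit elements, a rotate right by 48 bits moves source byte (k + 6) % 8 into result byte k, so the printed mask [6,7,0,1,2,3,4,5,14,15,8,9,10,11,12,13] is the faux shuffle that later folds with the test's byte-reverse shuffle into the single vpshufb shown in the new CHECK lines.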