From 06f136f61e6d23fde5c91f7fa0813d0291c17c97 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Fri, 18 Sep 2020 14:53:29 -0700
Subject: [PATCH] [instcombine][x86] Converted pdep/pext with shifted mask to
 simple arithmetic

If the mask of a pdep or pext instruction is a shifted mask (i.e. one
contiguous block of ones), we need at most one AND and one shift to
represent the operation without the intrinsic. On all platforms I know
of, this is faster than the pdep/pext.

The cost modelling for multiple contiguous blocks might be worth
exploring in a follow-up, but it's not relevant for my current use
case. It would almost certainly be a win on AMD processors, where these
instructions are really slow.

Differential Revision: https://reviews.llvm.org/D87861
---
 llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp    | 26 ++++++++++++++
 .../test/Transforms/InstCombine/X86/x86-bmi-tbm.ll | 42 ++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 94ee799..10f0018 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
 
+      if (MaskC->getValue().isShiftedMask()) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+        Value *Shifted = IC.Builder.CreateLShr(Masked,
+                                               ConstantInt::get(II.getType(),
+                                                                ShiftAmount));
+        return IC.replaceInstUsesWith(II, Shifted);
+      }
+
+
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
         uint64_t Mask = MaskC->getZExtValue();
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       if (MaskC->isAllOnesValue()) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
+      if (MaskC->getValue().isShiftedMask()) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Shifted = IC.Builder.CreateShl(Input,
+                                              ConstantInt::get(II.getType(),
+                                                               ShiftAmount));
+        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+        return IC.replaceInstUsesWith(II, Masked);
+      }
 
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
index 8d04eca..b7f8146 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
+; CHECK-NEXT:    %1 = lshr i32 %x, 1
+; CHECK-NEXT:    %2 = and i32 %1, 3
+; CHECK-NEXT:    ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
+  ret i32 %1
+}
+
+define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
+; CHECK-NEXT:    %1 = lshr i64 %x, 1
+; CHECK-NEXT:    %2 = and i64 %1, 3
+; CHECK-NEXT:    ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pext_32_constant_fold(
 ; CHECK-NEXT:    ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
+; CHECK-NEXT:    %1 = shl i32 %x, 2
+; CHECK-NEXT:    %2 = and i32 %1, 12
+; CHECK-NEXT:    ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
+  ret i32 %1
+}
+
+define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
+; CHECK-NEXT:    %1 = shl i64 %x, 2
+; CHECK-NEXT:    %2 = and i64 %1, 12
+; CHECK-NEXT:    ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
 ; CHECK-NEXT:    ret i32 807407616
--
2.7.4
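
Editor's note (not part of the patch): the fold can be exercised on its own with a small
.ll file mirroring the new tests. This is a minimal sketch; the file name, the function
names @pext_demo/@pdep_demo, and the exact opt invocation are illustrative assumptions,
and it presumes an opt binary built with the patch above (legacy pass syntax -instcombine,
or -passes=instcombine on newer LLVM). Per the CHECK lines in the test hunks, a pext with
shifted mask 6 canonicalizes to lshr+and, and a pdep with shifted mask 12 to shl+and.

; pdep-pext-fold.ll (hypothetical file name)
; Run with e.g.: opt -instcombine -S pdep-pext-fold.ll
declare i32 @llvm.x86.bmi.pext.32(i32, i32)
declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

define i32 @pext_demo(i32 %x) {
  ; mask 6 = 0b110 is a shifted mask; expect the call to fold to (%x >> 1) & 3
  %r = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
  ret i32 %r
}

define i32 @pdep_demo(i32 %x) {
  ; mask 12 = 0b1100 is a shifted mask; expect the call to fold to (%x << 2) & 12
  %r = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
  ret i32 %r
}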