From 06f136f61e6d23fde5c91f7fa0813d0291c17c97 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Fri, 18 Sep 2020 14:53:29 -0700
Subject: [PATCH] [instcombine][x86] Converted pdep/pext with shifted mask to
 simple arithmetic

If the mask of a pdep or pext instruction is a shifted mask (i.e. one
contiguous block of ones), we need at most one AND and one shift to
represent the operation without the intrinsic. On all platforms I know
of, this is faster than the pdep/pext.

The cost modelling for multiple contiguous blocks might be worth
exploring in a follow-up, but it's not relevant for my current use
case. It would almost certainly be a win on AMD processors, where these
instructions are really slow.

Differential Revision: https://reviews.llvm.org/D87861
---
 llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp    | 26 ++++++++++++++
 .../test/Transforms/InstCombine/X86/x86-bmi-tbm.ll | 42 ++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 94ee799..10f0018 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
 
+      if (MaskC->getValue().isShiftedMask()) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+        Value *Shifted = IC.Builder.CreateLShr(Masked,
+                                               ConstantInt::get(II.getType(),
+                                                                ShiftAmount));
+        return IC.replaceInstUsesWith(II, Shifted);
+      }
+
+
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
         uint64_t Mask = MaskC->getZExtValue();
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       if (MaskC->isAllOnesValue()) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
+      if (MaskC->getValue().isShiftedMask()) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Shifted = IC.Builder.CreateShl(Input,
+                                              ConstantInt::get(II.getType(),
+                                                               ShiftAmount));
+        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+        return IC.replaceInstUsesWith(II, Masked);
+      }
 
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
index 8d04eca..b7f8146 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
+; CHECK-NEXT:    %1 = lshr i32 %x, 1
+; CHECK-NEXT:    %2 = and i32 %1, 3
+; CHECK-NEXT:    ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
+  ret i32 %1
+}
+
+define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
+; CHECK-NEXT:    %1 = lshr i64 %x, 1
+; CHECK-NEXT:    %2 = and i64 %1, 3
+; CHECK-NEXT:    ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pext_32_constant_fold(
 ; CHECK-NEXT:    ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
+; CHECK-NEXT:    %1 = shl i32 %x, 2
+; CHECK-NEXT:    %2 = and i32 %1, 12
+; CHECK-NEXT:    ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
+  ret i32 %1
+}
+
+define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
+; CHECK-NEXT:    %1 = shl i64 %x, 2
+; CHECK-NEXT:    %2 = and i64 %1, 12
+; CHECK-NEXT:    ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
 ; CHECK-NEXT:    ret i32 807407616
--
2.7.4
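
Editor's note (not part of the patch): the fold can be exercised on its own with a small
.ll file mirroring the new tests. This is a minimal sketch; the file name, the function
names @pext_demo/@pdep_demo, and the exact opt invocation are illustrative assumptions,
and it presumes an opt binary built with the patch above (legacy pass syntax -instcombine,
or -passes=instcombine on newer LLVM). Per the CHECK lines in the test hunks, a pext with
shifted mask 6 canonicalizes to lshr+and, and a pdep with shifted mask 12 to shl+and.

; pdep-pext-fold.ll (hypothetical file name)
; Run with e.g.: opt -instcombine -S pdep-pext-fold.ll
declare i32 @llvm.x86.bmi.pext.32(i32, i32)
declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

define i32 @pext_demo(i32 %x) {
  ; mask 6 = 0b110 is a shifted mask; expect the call to fold to (%x >> 1) & 3
  %r = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
  ret i32 %r
}

define i32 @pdep_demo(i32 %x) {
  ; mask 12 = 0b1100 is a shifted mask; expect the call to fold to (%x << 2) & 12
  %r = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
  ret i32 %r
}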