return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
}
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates
+/// a use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isShift() && "Expected a shift as input");
+ // The shifted operand must be a one-use bitwise logic op (and/or/xor);
+ // without the one-use limit we would duplicate the inner shift rather than
+ // eliminate a use of it.
+ auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+ if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+ return nullptr;
+
+ // C1 is the outer shift amount. C0 (the inner shift amount) is bound by the
+ // matchFirstShift lambda below.
+ const APInt *C0, *C1;
+ if (!match(I.getOperand(1), m_APInt(C1)))
+ return nullptr;
+
+ Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+ Type *Ty = I.getType();
+
+ // Find a matching one-use shift by constant. The fold is not valid if the sum
+ // of the shift values equals or exceeds bitwidth.
+ // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+ // Note: m_Shift matches any shift opcode, so the explicit opcode comparison
+ // is what guarantees the inner and outer shifts are the same kind.
+ Value *X, *Y;
+ auto matchFirstShift = [&](Value *V) {
+ return match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) &&
+ cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode &&
+ (*C0 + *C1).ult(Ty->getScalarSizeInBits());
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ if (matchFirstShift(LogicInst->getOperand(0)))
+ Y = LogicInst->getOperand(1);
+ else if (matchFirstShift(LogicInst->getOperand(1)))
+ Y = LogicInst->getOperand(0);
+ else
+ return nullptr;
+
+ // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+ Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1);
+ Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+ Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+ return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
+
Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
assert(Op0->getType() == Op1->getType());
return &I;
}
+ if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+ return Logic;
+
return nullptr;
}
define i8 @shl_and(i8 %x, i8 %y) {
; CHECK-LABEL: @shl_and(
-; CHECK-NEXT: [[SH0:%.*]] = shl i8 [[X:%.*]], 3
-; CHECK-NEXT: [[R:%.*]] = and i8 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i8 [[R]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[TMP2:%.*]] = shl i8 [[Y:%.*]], 2
+; CHECK-NEXT: [[SH1:%.*]] = and i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[SH1]]
;
%sh0 = shl i8 %x, 3
define i16 @shl_or(i16 %x, i16 %py) {
; CHECK-LABEL: @shl_or(
; CHECK-NEXT: [[Y:%.*]] = srem i16 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = shl i16 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = or i16 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i16 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = shl i16 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i16 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = or i16 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i16 [[SH1]]
;
%y = srem i16 %py, 42 ; thwart complexity-based canonicalization
define i32 @shl_xor(i32 %x, i32 %y) {
; CHECK-LABEL: @shl_xor(
-; CHECK-NEXT: [[SH0:%.*]] = shl i32 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = xor i32 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i32 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[Y:%.*]], 7
+; CHECK-NEXT: [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i32 [[SH1]]
;
%sh0 = shl i32 %x, 5
define i64 @lshr_and(i64 %x, i64 %py) {
; CHECK-LABEL: @lshr_and(
; CHECK-NEXT: [[Y:%.*]] = srem i64 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = lshr i64 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = and i64 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr i64 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = and i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i64 [[SH1]]
;
%y = srem i64 %py, 42 ; thwart complexity-based canonicalization
define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @lshr_or(
-; CHECK-NEXT: [[SH0:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 5, i32 5, i32 5, i32 5>
-; CHECK-NEXT: [[R:%.*]] = or <4 x i32> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr <4 x i32> [[R]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 12, i32 12, i32 12, i32 12>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[Y:%.*]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[SH1:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[SH1]]
;
%sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %py) {
; CHECK-LABEL: @lshr_xor(
; CHECK-NEXT: [[Y:%.*]] = srem <8 x i16> [[PY:%.*]], <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
-; CHECK-NEXT: [[SH0:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
-; CHECK-NEXT: [[R:%.*]] = xor <8 x i16> [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr <8 x i16> [[R]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[Y]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT: [[SH1:%.*]] = xor <8 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <8 x i16> [[SH1]]
;
%y = srem <8 x i16> %py, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 -42> ; thwart complexity-based canonicalization
define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %py, <16 x i8> %pz) {
; CHECK-LABEL: @ashr_and(
; CHECK-NEXT: [[Y:%.*]] = srem <16 x i8> [[PY:%.*]], [[PZ:%.*]]
-; CHECK-NEXT: [[SH0:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT: [[R:%.*]] = and <16 x i8> [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr <16 x i8> [[R]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <16 x i8> [[Y]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT: [[SH1:%.*]] = and <16 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <16 x i8> [[SH1]]
;
%y = srem <16 x i8> %py, %pz ; thwart complexity-based canonicalization
define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: @ashr_or(
-; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 5, i64 5>
-; CHECK-NEXT: [[R:%.*]] = or <2 x i64> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i64> [[R]], <i64 7, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 12, i64 12>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[Y:%.*]], <i64 7, i64 7>
+; CHECK-NEXT: [[SH1:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <2 x i64> [[SH1]]
;
%sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
define i32 @ashr_xor(i32 %x, i32 %py) {
; CHECK-LABEL: @ashr_xor(
; CHECK-NEXT: [[Y:%.*]] = srem i32 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = ashr i32 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = xor i32 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr i32 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i32 [[SH1]]
;
%y = srem i32 %py, 42 ; thwart complexity-based canonicalization