From 02f79eae06895d16d5b1d2e8803d17997437d73f Mon Sep 17 00:00:00 2001
From: Hiroshi Inoue <inouehrs@jp.ibm.com>
Date: Wed, 1 Aug 2018 04:40:32 +0000
Subject: [PATCH] [InstSimplify] fold extracting from std::pair (1/2)

This patch intends to enable jump threading when a method whose return type is std::pair<int, bool> or std::pair<bool, int> is inlined.
For example, jump threading does not happen for the if statement in func.

std::pair<int, bool> callee(int v) {
  int a = dummy(v);
  if (a) return std::make_pair(dummy(v), true);
  else return std::make_pair(v, v < 0);
}

int func(int v) {
  std::pair<int, bool> rc = callee(v);
  if (rc.second) {
    // do something
  }

SROA executed before the method inlining replaces std::pair by i64 without splitting in both callee and func since at this point no access to the individual fields is seen to SROA.
After inlining, jump threading fails to identify that the incoming value is a constant due to additional instructions (like or, and, trunc).

This series of patch add patterns in InstructionSimplify to fold extraction of members of std::pair. To help jump threading, actually we need to optimize the code sequence spanning multiple BBs.
These patches does not handle phi by itself, but these additional patterns help NewGVN pass, which calls instsimplify to check opportunities for simplifying instructions over phi, apply phi-of-ops optimization to result in successful jump threading.
SimplifyDemandedBits in InstCombine, can do more general optimization but this patch aims to provide opportunities for other optimizers by supporting a simple but common case in InstSimplify.

This first patch in the series handles code sequences that merges two values using shl and or and then extracts one value using lshr.

Differential Revision: https://reviews.llvm.org/D48828

llvm-svn: 338485
---
 llvm/lib/Analysis/InstructionSimplify.cpp  | 17 +++++++++++++++++
 llvm/test/Transforms/InstSimplify/shift.ll | 12 ++----------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 6ebae37..7fc7c15 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1325,6 +1325,23 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
   if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1))))
     return X;
 
+  // ((X << A) | Y) >> A -> X  if effective width of Y is not larger than A.
+  // We can return X as we do in the above case since OR alters no bits in X.
+  // SimplifyDemandedBits in InstCombine can do more general optimization for
+  // bit manipulation. This pattern aims to provide opportunities for other
+  // optimizers by supporting a simple but common case in InstSimplify.
+  Value *Y;
+  const APInt *ShRAmt, *ShLAmt;
+  if (match(Op1, m_APInt(ShRAmt)) &&
+      match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) &&
+      *ShRAmt == *ShLAmt) {
+    const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+    const unsigned Width = Op0->getType()->getScalarSizeInBits();
+    const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
+    if (EffWidthY <= ShRAmt->getZExtValue())
+      return X;
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/shift.ll b/llvm/test/Transforms/InstSimplify/shift.ll
index f49c76b..cbffd37 100644
--- a/llvm/test/Transforms/InstSimplify/shift.ll
+++ b/llvm/test/Transforms/InstSimplify/shift.ll
@@ -178,11 +178,7 @@ define <2 x i8> @shl_by_sext_bool_vec(<2 x i1> %x, <2 x i8> %y) {
 define i64 @shl_or_shr(i32 %a, i32 %b) {
 ; CHECK-LABEL: @shl_or_shr(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[A:%.*]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[B:%.*]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP1]], 32
-; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 32
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %tmp1 = zext i32 %a to i64
   %tmp2 = zext i32 %b to i64
@@ -214,11 +210,7 @@ define i64 @shl_or_shr2(i32 %a, i32 %b) {
 define <2 x i64> @shl_or_shr1v(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @shl_or_shr1v(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <2 x i64> [[TMP1]], <i64 32, i64 32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], <i64 32, i64 32>
-; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %tmp1 = zext <2 x i32> %a to <2 x i64>
   %tmp2 = zext <2 x i32> %b to <2 x i64>
-- 
2.7.4