unsigned Generation = 0;
int MatchingId = -1;
bool IsAtomic = false;
+
+ // TODO: Remove this flag. It would be strictly stronger to add a record
+ // to the AvailableInvariants table when passing the invariant load
+ // instead.
bool IsInvariant = false;
LoadValue() = default;
LoadMapAllocator>;
LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
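+
+ // Scoping sketch (illustrative, names Loc/Gen are placeholders): facts
+ // recorded while visiting a dominator tree node are popped when the walk
+ // leaves that node, e.g.
+ //   { InvariantHTType::ScopeTy S(AvailableInvariants);
+ //     AvailableInvariants.insert(Loc, Gen); // visible below this point
+ //   }                                       // popped here
+ // so an invariant fact recorded in a basic block is visible exactly in
+ // the blocks that block dominates.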
/// \brief A scoped hash table of the current values of read-only calls.
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
private:
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
};
class StackNode {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end)
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+ DomTreeNode::iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
{}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
ExpectedType);
}
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
return MSSA->dominates(LaterDef, EarlierMA);
}
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "Target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::getOrNone. TODO: handle these as well.
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Did this memory location become invariant at or before the generation
+ // we are asking about?
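+ // For example: if invariant.start for %p was seen at generation 1 and a
+ // load of %p was recorded at generation 2 (GenAt == 2), the recorded value
+ // may be reused at any later generation, since nothing may legally write
+ // %p once the region is invariant (1 <= 2).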
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
bool EarlyCSE::processNode(DomTreeNode *Node) {
bool Changed = false;
BasicBlock *BB = Node->getBlock();
continue;
}
- // Skip invariant.start intrinsics since they only read memory, and we can
- // forward values across it. Also, we dont need to consume the last store
- // since the semantics of invariant.start allow us to perform DSE of the
- // last store, if there was a store following invariant.start. Consider:
+ // We can skip all invariant.start intrinsics since they only read
+ // memory, and we can forward values across them. For an invariant.start
+ // without a matching invariant.end, the invariantness never ends, so we
+ // can start a scope in the current generation that remains valid for all
+ // future generations. Also, we don't need to consume the last store,
+ // since the semantics of invariant.start allow us to perform DSE of the
+ // last store if there was a store following invariant.start. Consider:
//
// store 30, i8* p
// invariant.start(p)
// store 40, i8* p
// We can DSE the store of 30, since the store of 40 to the invariant
// location p causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+ if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If the returned token has any uses, a matching invariant.end may
+ // terminate the region, so conservatively don't start a scope.
+ if (!Inst->use_empty())
+ continue;
+ auto *CI = cast<CallInst>(Inst);
+ MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
continue;
+ }
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+ (InVal.IsInvariant ||
+ isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
InVal.DefInst, Inst))) {
Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst)) {
+ (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ isSameMemGeneration(InVal.Generation, CurrentGeneration,
+ InVal.DefInst, Inst))) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
// Process the root node.
nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
- DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- NodeToProcess->childGeneration(), child, child->begin(),
- child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
; Check that we do load-load forwarding over invariant.start, since it does not
; clobber memory
-define i8 @test1(i8 *%P) {
- ; CHECK-LABEL: @test1(
+define i8 @test_bypass1(i8 *%P) {
+ ; CHECK-LABEL: @test_bypass1(
; CHECK-NEXT: %V1 = load i8, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: ret i8 0
-
%V1 = load i8, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
%V2 = load i8, i8* %P
; Trivial Store->load forwarding over invariant.start
-define i8 @test2(i8 *%P) {
- ; CHECK-LABEL: @test2(
+define i8 @test_bypass2(i8 *%P) {
+ ; CHECK-LABEL: @test_bypass2(
; CHECK-NEXT: store i8 42, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: ret i8 42
-
store i8 42, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
%V1 = load i8, i8* %P
; We can DSE over invariant.start calls, since the first store to
; %P is valid, and the second store writes to an invariant location, which
; is undefined behaviour, making it effectively unreachable.
-define void @test3(i8* %P) {
-
-; CHECK-LABEL: @test3(
+define void @test_bypass3(i8* %P) {
+; CHECK-LABEL: @test_bypass3(
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: store i8 60, i8* %P
-
store i8 50, i8* %P
%i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
store i8 60, i8* %P
; FIXME: Now the first store can actually be eliminated, since there is no read within
; the invariant region, between start and end.
-define void @test4(i8* %P) {
-; CHECK-LABEL: @test4(
+define void @test_bypass4(i8* %P) {
+; CHECK-LABEL: @test_bypass4(
; CHECK-NEXT: store i8 50, i8* %P
; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
; CHECK-NEXT: call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %P)
store i8 60, i8* %P
ret void
}
+
+
+declare void @clobber()
+declare {}* @llvm.invariant.start.p0i32(i64 %size, i32* nocapture %ptr)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32* nocapture) nounwind
+
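+; Both loads happen inside the invariant region, so the second load can be
+; forwarded from the first even across the clobbering call.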
+define i32 @test_before_load(i32* %p) {
+; CHECK-LABEL: @test_before_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
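+; The first load executes at the same generation in which the invariant
+; region begins, so it remains available to the load after the clobber.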
+define i32 @test_before_clobber(i32* %p) {
+; CHECK-LABEL: @test_before_clobber
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
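+; Even when the first available load is itself separated from the
+; invariant.start by a clobber, both loads are within the invariant region
+; and must observe the same value.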
+define i32 @test_unanalyzable_load(i32* %p) {
+; CHECK-LABEL: @test_unanalyzable_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
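+; Negative test: the first load predates the invariant region, and a clobber
+; intervenes before the region begins, so the two loads may differ.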
+define i32 @test_negative_after_clobber(i32* %p) {
+; CHECK-LABEL: @test_negative_after_clobber
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
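+; The invariant region begins in the entry block, which dominates the merge
+; block, so the scope is still available after the control-flow merge.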
+define i32 @test_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
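+; Negative test: the invariant region only begins in the merge block, after
+; the clobber on the taken path may already have run.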
+define i32 @test_negative_after_mergeclobber(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_negative_after_mergeclobber
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
+; In theory, this version could work, but EarlyCSE is incapable of
+; merging facts along distinct paths.
+define i32 @test_false_negative_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_merge
+; CHECK: ret i32 %sub
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
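+; As in test_unanalyzable_load, but with a control-flow merge inside the
+; invariant region; the scope started in the dominating entry block still
+; applies.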
+define i32 @test_merge_unanalyzable_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge_unanalyzable_load
+; CHECK: ret i32 0
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ %v1 = load i32, i32* %p
+ br i1 %cnd, label %merge, label %taken
+
+taken:
+ call void @clobber()
+ br label %merge
+merge:
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
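+; The store writes back a value which provably still resides at the
+; invariant location, so it is dead and can be removed.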
+define void @test_dse_before_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_before_load
+; CHECK-NOT: store
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ store i32 %v1, i32* %p
+ ret void
+}
+
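+; Same as above, but the available load is recorded at the same generation
+; in which the invariant region begins, which is still sufficient.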
+define void @test_dse_after_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_after_load
+; CHECK-NOT: store
+ %v1 = load i32, i32* %p
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @clobber()
+ store i32 %v1, i32* %p
+ ret void
+}
+
+
+; In this case, we have a false negative since MemoryLocation is implicitly
+; typed due to the use of a Value to represent the address. Note that other
+; passes will canonicalize away the bitcasts in this example.
+define i32 @test_false_negative_types(i32* %p) {
+; CHECK-LABEL: @test_false_negative_types
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %pf = bitcast i32* %p to float*
+ %v2f = load float, float* %pf
+ %v2 = bitcast float %v2f to i32
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
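+; Negative test: the 3-byte invariant region does not cover the 4-byte load,
+; so the recorded memory location does not match.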
+define i32 @test_negative_size1(i32* %p) {
+; CHECK-LABEL: @test_negative_size1
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 3, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
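+; Negative test: a zero-sized invariant region covers nothing.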
+define i32 @test_negative_size2(i32* %p) {
+; CHECK-LABEL: @test_negative_size2
+; CHECK: ret i32 %sub
+ call {}* @llvm.invariant.start.p0i32(i64 0, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
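+; Negative test: the invariant.start token is used by an invariant.end, so
+; the region may end and no scope is started.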
+define i32 @test_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_negative_scope
+; CHECK: ret i32 %sub
+ %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
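+; Here the region provably covers both loads, but we conservatively refuse
+; to start a scope whenever the token has uses.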
+define i32 @test_false_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_false_negative_scope
+; CHECK: ret i32 %sub
+ %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+ %v1 = load i32, i32* %p
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}
+
+; An invariant load de facto starts an invariant.start scope of the
+; appropriate size.
+define i32 @test_invariant_load_scope(i32* %p) {
+; CHECK-LABEL: @test_invariant_load_scope
+; CHECK: ret i32 0
+ %v1 = load i32, i32* %p, !invariant.load !{}
+ call void @clobber()
+ %v2 = load i32, i32* %p
+ %sub = sub i32 %v1, %v2
+ ret i32 %sub
+}