From f171149e0d541ca7da7af5fe59bd6d9a77267d24 Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Thu, 5 Aug 2021 15:37:19 +0100
Subject: [PATCH] [SimplifyCFG] Speculate a store preceded by a local
 non-escaping load

In SimplifyCFG we may simplify the CFG by speculatively executing certain
stores when they are preceded by a store to the same location. This patch
allows such speculation also when the store is similarly preceded by a load
of the same location.

In order for this transformation to be correct, we need to ensure that the
memory location is writable and that the store in its new location does not
introduce a data race.

Local objects (created by an `alloca` instruction) are always writable, so
once we are past a read from a location, it is valid to also write to that
same location.
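As an illustration (a reduced sketch of the first new test below, not
verbatim pass output), a store guarded by a branch, such as

      %v = load i32, i32* %p        ; %p points into a local alloca
      %cmp = icmp slt i32 %v, %b
      br i1 %cmp, label %if.then, label %if.end
    if.then:
      store i32 %b, i32* %p
      br label %if.end

becomes branch-free code that stores either the new value or the value
just loaded:

      %v = load i32, i32* %p
      %cmp = icmp slt i32 %v, %b
      %spec.store.select = select i1 %cmp, i32 %b, i32 %v
      store i32 %spec.store.select, i32* %p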
Seeing just a load does not guarantee the absence of a data race (unlike
seeing a store): the load may still be part of a race, just not one that
causes undefined behaviour (cf.
https://llvm.org/docs/Atomics.html#optimization-outside-atomic). In the
original program, a data race might have been prevented by the condition,
but once we move the store outside the condition, we must be sure a data
race wasn't possible anyway, no matter what the condition evaluates to.

One way to be sure that a local object is never concurrently read or
written is to check that its address never escapes the function. Hence
this transformation is restricted to local, non-escaping objects.
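For example (a sketch along the lines of the new `load_before_store_escape`
test below, where `@fork_some_threads` stands for any call that publishes
the address of the alloca to other threads):

      call void @fork_some_threads([2 x i32]* %a)   ; %a escapes here
      %v = load i32, i32* %arrayidx                 ; %arrayidx points into %a
      %cmp = icmp slt i32 %v, %b
      br i1 %cmp, label %if.then, label %if.end
    if.then:
      store i32 %b, i32* %arrayidx

Another thread may legitimately write to %arrayidx while this thread does
not take the if.then branch; hoisting the store out of the branch would
make this function write unconditionally and could introduce a race, so
the speculation must be suppressed once the address has escaped.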
Reviewed By: nikic, lebedev.ri

Differential Revision: https://reviews.llvm.org/D107281
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp          |  18 +++
 .../test/Transforms/SimplifyCFG/speculate-store.ll | 139 +++++++++++++++++++++
 2 files changed, 157 insertions(+)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0e39284..a72af2b 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/GuardUtils.h"
@@ -2250,6 +2251,23 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
         return SI->getValueOperand();
       return nullptr; // Unknown store.
     }
+
+    if (auto *LI = dyn_cast<LoadInst>(&CurI)) {
+      if (LI->getPointerOperand() == StorePtr && LI->getType() == StoreTy &&
+          LI->isSimple()) {
+        // Local objects (created by an `alloca` instruction) are always
+        // writable, so once we are past a read from a location it is valid to
+        // also write to that same location.
+        // If the address of the local object never escapes the function, that
+        // means it's never concurrently read or written, hence moving the store
+        // from under the condition will not introduce a data race.
+        auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(StorePtr));
+        if (AI && !PointerMayBeCaptured(AI, false, true))
+          // Found a previous load, return it.
+          return LI;
+      }
+      // The load didn't work out, but we may still find a store.
+    }
   }
 
   return nullptr;
diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
index 8ceba7d..0e44786 100644
--- a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
+++ b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
@@ -175,6 +175,145 @@ ret.end:
   ret void
 }
 
+;; Speculate a store, preceded by a local, non-escaping load
+define i32 @load_before_store_noescape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_noescape(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[TMP1]]
+; CHECK-NEXT:    store i32 [[SPEC_STORE_SELECT]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+}
+
+;; Don't speculate a store, preceded by a local, escaping load
+define i32 @load_before_store_escape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_escape(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    call void @fork_some_threads([2 x i32]* [[A]])
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    call void @join_some_threads()
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  call void @fork_some_threads([2 x i32]* %a)
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  call void @join_some_threads()
+  ret i32 %add
+}
+
+declare void @fork_some_threads([2 x i32] *);
+declare void @join_some_threads();
+
+; Don't speculate if it's not the only instruction in the block (not counting
+; the terminator)
+define i32 @not_alone_in_block(i64 %i, i32 %b) {
+; CHECK-LABEL: @not_alone_in_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.end:
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+}
+
 ; CHECK: !0 = !{!"branch_weights", i32 3, i32 5}
 !0 = !{!"branch_weights", i32 3, i32 5}
-- 
2.7.4