From f171149e0d541ca7da7af5fe59bd6d9a77267d24 Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Thu, 5 Aug 2021 15:37:19 +0100
Subject: [PATCH] [SimplifyCFG] Speculate a store preceded by a local
 non-escaping load

In SimplifyCFG we may simplify the CFG by speculatively executing certain
stores when they are preceded by a store to the same location. This patch
allows such speculation also when the store is similarly preceded by a load
of the same location.

In order for this transformation to be correct, we need to ensure that the
memory location is writable and that the store in its new location does not
introduce a data race.

Local objects (created by an `alloca` instruction) are always writable, so
once we are past a read from a location, it is valid to also write to that
same location.
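As an illustration (a reduced sketch of the first new test below, not
verbatim pass output), a store guarded by a branch, such as

      %v = load i32, i32* %p        ; %p points into a local alloca
      %cmp = icmp slt i32 %v, %b
      br i1 %cmp, label %if.then, label %if.end
    if.then:
      store i32 %b, i32* %p
      br label %if.end

becomes branch-free code that stores either the new value or the value
just loaded:

      %v = load i32, i32* %p
      %cmp = icmp slt i32 %v, %b
      %spec.store.select = select i1 %cmp, i32 %b, i32 %v
      store i32 %spec.store.select, i32* %p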
Seeing just a load does not guarantee the absence of a data race (unlike
seeing a store): the load may still be part of a race, just not one that
causes undefined behaviour (cf.
https://llvm.org/docs/Atomics.html#optimization-outside-atomic). In the
original program, a data race might have been prevented by the condition,
but once we move the store outside the condition, we must be sure a data
race wasn't possible anyway, no matter what the condition evaluates to.

One way to be sure that a local object is never concurrently read or
written is to check that its address never escapes the function. Hence
this transformation is restricted to local, non-escaping objects.
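For example (a sketch along the lines of the new `load_before_store_escape`
test below, where `@fork_some_threads` stands for any call that publishes
the address of the alloca to other threads):

      call void @fork_some_threads([2 x i32]* %a)   ; %a escapes here
      %v = load i32, i32* %arrayidx                 ; %arrayidx points into %a
      %cmp = icmp slt i32 %v, %b
      br i1 %cmp, label %if.then, label %if.end
    if.then:
      store i32 %b, i32* %arrayidx

Another thread may legitimately write to %arrayidx while this thread does
not take the if.then branch; hoisting the store out of the branch would
make this function write unconditionally and could introduce a race, so
the speculation must be suppressed once the address has escaped.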
Reviewed By: nikic, lebedev.ri

Differential Revision: https://reviews.llvm.org/D107281
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp          |  18 +++
 .../test/Transforms/SimplifyCFG/speculate-store.ll | 139 +++++++++++++++++++++
 2 files changed, 157 insertions(+)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0e39284..a72af2b 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/GuardUtils.h"
@@ -2250,6 +2251,23 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
         return SI->getValueOperand();
       return nullptr; // Unknown store.
     }
+
+    if (auto *LI = dyn_cast<LoadInst>(&CurI)) {
+      if (LI->getPointerOperand() == StorePtr && LI->getType() == StoreTy &&
+          LI->isSimple()) {
+        // Local objects (created by an `alloca` instruction) are always
+        // writable, so once we are past a read from a location it is valid to
+        // also write to that same location.
+        // If the address of the local object never escapes the function, that
+        // means it's never concurrently read or written, hence moving the store
+        // from under the condition will not introduce a data race.
+        auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(StorePtr));
+        if (AI && !PointerMayBeCaptured(AI, false, true))
+          // Found a previous load, return it.
+          return LI;
+      }
+      // The load didn't work out, but we may still find a store.
+    }
   }
 
   return nullptr;
diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
index 8ceba7d..0e44786 100644
--- a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
+++ b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
@@ -175,6 +175,145 @@ ret.end:
   ret void
 }
 
+;; Speculate a store, preceded by a local, non-escaping load
+define i32 @load_before_store_noescape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_noescape(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[TMP1]]
+; CHECK-NEXT:    store i32 [[SPEC_STORE_SELECT]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+}
+
+;; Don't speculate a store, preceded by a local, escaping load
+define i32 @load_before_store_escape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_escape(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    call void @fork_some_threads([2 x i32]* [[A]])
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    call void @join_some_threads()
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  call void @fork_some_threads([2 x i32]* %a)
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  call void @join_some_threads()
+  ret i32 %add
+}
+
+declare void @fork_some_threads([2 x i32] *);
+declare void @join_some_threads();
+
+; Don't speculate if it's not the only instruction in the block (not counting
+; the terminator)
+define i32 @not_alone_in_block(i64 %i, i32 %b) {
+; CHECK-LABEL: @not_alone_in_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT:    store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i32 [[B]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  %a = alloca [2 x i32], align 8
+  %0 = bitcast [2 x i32]* %a to i64*
+  store i64 4294967296, i64* %0, align 8
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+  %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp slt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %b, i32* %arrayidx, align 4
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.end:
+  %2 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+}
+
 ; CHECK: !0 = !{!"branch_weights", i32 3, i32 5}
 !0 = !{!"branch_weights", i32 3, i32 5}
-- 
2.7.4