From c9821abfc023fba684c8ef8589c49cba8083f579 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman"
Date: Mon, 23 Jan 2023 11:13:58 -0800
Subject: [PATCH] [WoA] Use fences for sequentially consistent stores/writes

LLVM currently uses LDAR/STLR and their variants for acquire/release as
well as seq_cst operations. This is fine as long as all code follows this
convention.

LDAR and STLR normally act as one-way barriers, but when used in
combination they provide a sequentially consistent model: when an LDAR
appears after an STLR in program order, the STLR acts as a two-way fence
and the store is observed before the load.

The problem is that a normal load (unlike an LDAR) that appears after an
STLR can be observed before the STLR (if my understanding is correct),
potentially providing weaker guarantees than expected when such loads are
used for ordered atomic operations.

Unfortunately, the Microsoft Visual Studio STL implements seq_cst loads
and stores using normal loads/stores plus explicit fences:

  store: dmb ish + str + dmb ish
  load:  ldr + dmb ish

This patch emits fences for the MSVC target whenever we write to memory
with sequentially consistent ordering, so that we do not rely on the
assumption that using LDAR/STLR alone gives us sequentially consistent
ordering. (An illustrative litmus-test sketch is appended after the
patch.)

Differential Revision: https://reviews.llvm.org/D141748

Change-Id: I48f3500ff8ec89677c9f089ce58181bd139bc68a
---
 llvm/include/llvm/CodeGen/TargetLowering.h      |  7 +++++++
 llvm/lib/CodeGen/AtomicExpandPass.cpp           | 21 ++++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 24 +++++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h   |  2 ++
 llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll    | 28 ++++++++++++++++++-------
 5 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8317a2e..639d48e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1983,6 +1983,13 @@ public:
     return false;
   }
 
+  /// Whether AtomicExpandPass should automatically insert a trailing fence
+  /// without reducing the ordering for this atomic. Defaults to false.
+  virtual bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const {
+    return false;
+  }
+
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 64b72e9..c8bea66 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -289,6 +289,24 @@ bool AtomicExpand::runOnFunction(Function &F) {
       if (FenceOrdering != AtomicOrdering::Monotonic) {
         MadeChange |= bracketInstWithFences(I, FenceOrdering);
       }
+    } else if (I->hasAtomicStore() &&
+               TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+      auto FenceOrdering = AtomicOrdering::Monotonic;
+      if (SI)
+        FenceOrdering = SI->getOrdering();
+      else if (RMWI)
+        FenceOrdering = RMWI->getOrdering();
+      else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+                           TargetLoweringBase::AtomicExpansionKind::LLSC)
+        // LLSC is handled in expandAtomicCmpXchg().
+        FenceOrdering = CASI->getSuccessOrdering();
+
+      IRBuilder Builder(I);
+      if (auto TrailingFence =
+              TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+        TrailingFence->moveAfter(I);
+        MadeChange = true;
+      }
     }
 
     if (LI)
@@ -1356,7 +1374,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   // Make sure later instructions don't get reordered with a fence if
   // necessary.
   Builder.SetInsertPoint(SuccessBB);
-  if (ShouldInsertFencesForAtomic)
+  if (ShouldInsertFencesForAtomic ||
+      TLI->shouldInsertTrailingFenceForAtomicStore(CI))
     TLI->emitTrailingFence(Builder, CI, SuccessOrder);
   Builder.CreateBr(ExitBB);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 95c9db7..abe4ef7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22300,6 +22300,30 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic(
   return isOpSuitableForLDPSTP(I);
 }
 
+bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
+    const Instruction *I) const {
+  // Store-Release instructions only provide seq_cst guarantees when paired with
+  // Load-Acquire instructions. MSVC CRT does not use these instructions to
+  // implement seq_cst loads and stores, so we need additional explicit fences
+  // after memory writes.
+  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+    return false;
+
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::AtomicCmpXchg:
+    return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::AtomicRMW:
+    return cast<AtomicRMWInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::Store:
+    return cast<StoreInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  }
+}
+
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 99965ce..0edec72 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -706,6 +706,8 @@ public:
   bool isOpSuitableForLDPSTP(const Instruction *I) const;
 
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+  bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override;
 
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
index 73dedfe..762e70a 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
@@ -19,6 +19,7 @@ define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw add ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -135,16 +136,17 @@ define dso_local i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
 define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_sub_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB7_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    sub x10, x8, x0
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    sub x10, x0, x8
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw sub ptr @var64, i64 %offset seq_cst
   ret i64 %old
@@ -199,6 +201,7 @@ define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB10_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw and ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -235,6 +238,7 @@ define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB12_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw or ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -343,6 +347,7 @@ define dso_local i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB18_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw xor ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -397,6 +402,7 @@ define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w10, .LBB21_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw xchg ptr @var16, i16 %offset seq_cst
   ret i16 %old
@@ -500,17 +506,18 @@ define dso_local i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
 define dso_local i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB27_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    cmp x8, x0
-; CHECK-NEXT:    csel x10, x8, x0, le
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x10, x0, x8, le
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB27_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw min ptr @var64, i64 %offset seq_cst
   ret i64 %old
@@ -531,6 +538,7 @@ define dso_local i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB28_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw max ptr @var8, i8 %offset seq_cst
   ret i8 %old
@@ -648,6 +656,7 @@ define dso_local i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB34_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw umin ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -726,6 +735,7 @@ define dso_local i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB38_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   %old = atomicrmw umax ptr @var32, i32 %offset seq_cst
   ret i32 %old
@@ -794,7 +804,8 @@ define dso_local i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; CHECK-NEXT:    // in Loop: Header=BB41_1 Depth=1
 ; CHECK-NEXT:    stlxrh w10, w1, [x9]
 ; CHECK-NEXT:    cbnz w10, .LBB41_1
-; CHECK-NEXT:  // %bb.3: // %cmpxchg.end
+; CHECK-NEXT:  // %bb.3: // %cmpxchg.success
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB41_4: // %cmpxchg.nostore
@@ -972,6 +983,7 @@ define dso_local void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
 ; CHECK-NEXT:    adrp x8, var8
 ; CHECK-NEXT:    add x8, x8, :lo12:var8
 ; CHECK-NEXT:    stlrb w0, [x8]
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   store atomic i8 %val, ptr @var8 seq_cst, align 1
   ret void
-- 
2.7.4
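
Illustration (not part of the patch): the classic store-buffering litmus test
below is a minimal sketch of the interop hazard the commit message describes.
Under seq_cst, both threads reading 0 is forbidden. If a seq_cst store is
lowered to a bare STLR (LLVM's default) while a later seq_cst load in the same
thread comes from MSVC-compiled code (plain LDR followed by DMB ISH), nothing
orders that LDR after the STLR, so the forbidden outcome can become
observable; the trailing DMB ISH emitted by this patch restores the ordering.
The C++ below is illustrative only and assumes that cross-compiler mixing;
it is not code from the patch.

    // Store-buffering (Dekker-style) litmus test, C++17.
    // With seq_cst semantics the outcome r1 == 0 && r2 == 0 must be impossible.
    #include <atomic>
    #include <cassert>
    #include <thread>

    std::atomic<int> x{0}, y{0};
    int r1 = -1, r2 = -1;

    void thread1() {
      // Possible AArch64 lowerings of this seq_cst store:
      //   LLVM default:  stlr
      //   MSVC STL:      dmb ish; str; dmb ish
      x.store(1, std::memory_order_seq_cst);
      // Possible lowerings of this seq_cst load:
      //   LLVM default:  ldar
      //   MSVC STL:      ldr; dmb ish
      // A plain ldr here is not ordered after a preceding bare stlr.
      r1 = y.load(std::memory_order_seq_cst);
    }

    void thread2() {
      y.store(1, std::memory_order_seq_cst);
      r2 = x.load(std::memory_order_seq_cst);
    }

    int main() {
      std::thread t1(thread1), t2(thread2);
      t1.join();
      t2.join();
      assert(r1 == 1 || r2 == 1); // seq_cst forbids both threads reading 0
      return 0;
    }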