From 10d34f5538e0d231a172608bdb7d08824771c7c7 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Mon, 20 Sep 2021 14:19:14 +0100 Subject: [PATCH] AArch64: use CAS instead of LDXR/STXR if available This covers 128-bit loads, and atomicrmw operations without a single native instruction. Using CAS has a better chance of succeeding with high contention on some systems. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +- .../CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll | 13 +- .../CodeGen/AArch64/GlobalISel/arm64-atomic.ll | 28 +- llvm/test/CodeGen/AArch64/arm64-atomic-128.ll | 777 ++++++++++++++++----- llvm/test/CodeGen/AArch64/atomic-ops-lse.ll | 3 +- llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll | 16 +- 6 files changed, 641 insertions(+), 207 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bad7a1b..3b5203f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21903,7 +21903,10 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::LLSC; + // Using CAS for an atomic load has a better chance of succeeding under high + // contention situations. So use it if available. + return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::LLSC; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, @@ -21940,8 +21943,10 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // implement atomicrmw without spilling. If the target address is also on the // stack and close enough to the spill slot, this can lead to a situation // where the monitor always gets cleared and the atomic operation can never - // succeed. So at -O0 lower this operation to a CAS loop. 
- if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if + // we have a single CAS instruction that can replace the loop. + if (getTargetMachine().getOptLevel() == CodeGenOpt::None || + Subtarget->hasLSE()) return AtomicExpansionKind::CmpXChg; return AtomicExpansionKind::LLSC; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll index dec4318..a3d8531 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -360,14 +360,11 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) { ; ; CHECK-CAS-O1-LABEL: atomic_load_relaxed: ; CHECK-CAS-O1: // %bb.0: -; CHECK-CAS-O1-NEXT: .LBB4_1: // %atomicrmw.start -; CHECK-CAS-O1-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-CAS-O1-NEXT: ldxp x9, x8, [x2] -; CHECK-CAS-O1-NEXT: stxp w10, x9, x8, [x2] -; CHECK-CAS-O1-NEXT: cbnz w10, .LBB4_1 -; CHECK-CAS-O1-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-CAS-O1-NEXT: mov v0.d[0], x9 -; CHECK-CAS-O1-NEXT: mov v0.d[1], x8 +; CHECK-CAS-O1-NEXT: mov x0, xzr +; CHECK-CAS-O1-NEXT: mov x1, xzr +; CHECK-CAS-O1-NEXT: casp x0, x1, x0, x1, [x2] +; CHECK-CAS-O1-NEXT: mov v0.d[0], x0 +; CHECK-CAS-O1-NEXT: mov v0.d[1], x1 ; CHECK-CAS-O1-NEXT: str q0, [x3] ; CHECK-CAS-O1-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index d26ac80..10d6abc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -362,15 +362,17 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; ; CHECK-LSE-O1-LABEL: fetch_and_nand: ; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x0 +; CHECK-LSE-O1-NEXT: ldr w0, [x0] ; CHECK-LSE-O1-NEXT: LBB6_1: ; %atomicrmw.start ; CHECK-LSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
CHECK-LSE-O1-NEXT: ldxr w8, [x0] -; CHECK-LSE-O1-NEXT: and w9, w8, #0x7 -; CHECK-LSE-O1-NEXT: mvn w9, w9 -; CHECK-LSE-O1-NEXT: stlxr w10, w9, [x0] -; CHECK-LSE-O1-NEXT: cbnz w10, LBB6_1 +; CHECK-LSE-O1-NEXT: mov x9, x0 +; CHECK-LSE-O1-NEXT: and w10, w0, #0x7 +; CHECK-LSE-O1-NEXT: mvn w10, w10 +; CHECK-LSE-O1-NEXT: casl w0, w10, [x8] +; CHECK-LSE-O1-NEXT: cmp w0, w9 +; CHECK-LSE-O1-NEXT: b.ne LBB6_1 ; CHECK-LSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-LSE-O1-NEXT: mov x0, x8 ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: fetch_and_nand: @@ -455,15 +457,17 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; ; CHECK-LSE-O1-LABEL: fetch_and_nand_64: ; CHECK-LSE-O1: ; %bb.0: +; CHECK-LSE-O1-NEXT: mov x8, x0 +; CHECK-LSE-O1-NEXT: ldr x0, [x0] ; CHECK-LSE-O1-NEXT: LBB7_1: ; %atomicrmw.start ; CHECK-LSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-LSE-O1-NEXT: ldaxr x8, [x0] -; CHECK-LSE-O1-NEXT: and x9, x8, #0x7 -; CHECK-LSE-O1-NEXT: mvn x9, x9 -; CHECK-LSE-O1-NEXT: stlxr w10, x9, [x0] -; CHECK-LSE-O1-NEXT: cbnz w10, LBB7_1 +; CHECK-LSE-O1-NEXT: mov x9, x0 +; CHECK-LSE-O1-NEXT: and x10, x0, #0x7 +; CHECK-LSE-O1-NEXT: mvn x10, x10 +; CHECK-LSE-O1-NEXT: casal x0, x10, [x8] +; CHECK-LSE-O1-NEXT: cmp x0, x9 +; CHECK-LSE-O1-NEXT: b.ne LBB7_1 ; CHECK-LSE-O1-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-LSE-O1-NEXT: mov x0, x8 ; CHECK-LSE-O1-NEXT: ret ; ; CHECK-LSE-O0-LABEL: fetch_and_nand_64: diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll index 65106b92..c5884ae 100644 --- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s -check-prefixes=CHECK,NOOUTLINE -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -mattr=+outline-atomics | FileCheck %s 
-check-prefixes=CHECK,OUTLINE -; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -mattr=+lse | FileCheck %s -check-prefixes=CHECK,LSE +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s -check-prefixes=NOOUTLINE +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -mattr=+outline-atomics | FileCheck %s -check-prefixes=OUTLINE +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -mattr=+lse | FileCheck %s -check-prefixes=LSE @var = global i128 0 @@ -210,22 +210,62 @@ define i128 @val_compare_and_swap_monotonic(i128* %p, i128 %oldval, i128 %newval } define void @fetch_and_nand(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_nand: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB4_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldxp x9, x8, [x0] -; CHECK-NEXT: and x10, x9, x2 -; CHECK-NEXT: and x11, x8, x3 -; CHECK-NEXT: mvn x11, x11 -; CHECK-NEXT: mvn x10, x10 -; CHECK-NEXT: stlxp w12, x10, x11, [x0] -; CHECK-NEXT: cbnz w12, .LBB4_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: fetch_and_nand: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB4_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldxp x9, x8, [x0] +; NOOUTLINE-NEXT: and x10, x9, x2 +; NOOUTLINE-NEXT: and x11, x8, x3 +; NOOUTLINE-NEXT: mvn x11, x11 +; NOOUTLINE-NEXT: mvn x10, x10 +; NOOUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB4_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_nand: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB4_1: // %atomicrmw.start +; 
OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldxp x9, x8, [x0] +; OUTLINE-NEXT: and x10, x9, x2 +; OUTLINE-NEXT: and x11, x8, x3 +; OUTLINE-NEXT: mvn x11, x11 +; OUTLINE-NEXT: mvn x10, x10 +; OUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB4_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_nand: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB4_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: and x8, x7, x3 +; LSE-NEXT: and x9, x4, x2 +; LSE-NEXT: mvn x10, x9 +; LSE-NEXT: mvn x11, x8 +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspl x4, x5, x10, x11, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB4_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw nand i128* %p, i128 %bits release store i128 %val, i128* @var, align 16 @@ -233,20 +273,56 @@ define void @fetch_and_nand(i128* %p, i128 %bits) { } define void @fetch_and_or(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_or: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB5_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: orr x10, x8, x3 -; CHECK-NEXT: orr x11, x9, x2 -; CHECK-NEXT: stlxp w12, x11, x10, [x0] -; CHECK-NEXT: cbnz w12, .LBB5_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: fetch_and_or: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB5_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: 
Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: orr x10, x8, x3 +; NOOUTLINE-NEXT: orr x11, x9, x2 +; NOOUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB5_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_or: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB5_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: orr x10, x8, x3 +; OUTLINE-NEXT: orr x11, x9, x2 +; OUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB5_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_or: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB5_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: orr x8, x4, x2 +; LSE-NEXT: orr x9, x7, x3 +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB5_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw or i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 @@ -254,207 +330,554 @@ define void @fetch_and_or(i128* %p, i128 %bits) { } define void @fetch_and_add(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_add: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB6_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: adds x10, x9, x2 -; CHECK-NEXT: adc x11, x8, x3 -; CHECK-NEXT: stlxp 
w12, x10, x11, [x0] -; CHECK-NEXT: cbnz w12, .LBB6_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_add: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB6_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: adds x10, x9, x2 +; NOOUTLINE-NEXT: adc x11, x8, x3 +; NOOUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB6_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_add: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB6_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: adds x10, x9, x2 +; OUTLINE-NEXT: adc x11, x8, x3 +; OUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB6_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_add: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB6_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: adds x8, x4, x2 +; LSE-NEXT: adc x9, x7, x3 +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB6_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw add i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 
ret void } define void @fetch_and_sub(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_sub: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB7_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: subs x10, x9, x2 -; CHECK-NEXT: sbc x11, x8, x3 -; CHECK-NEXT: stlxp w12, x10, x11, [x0] -; CHECK-NEXT: cbnz w12, .LBB7_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_sub: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB7_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: subs x10, x9, x2 +; NOOUTLINE-NEXT: sbc x11, x8, x3 +; NOOUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB7_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_sub: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB7_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: subs x10, x9, x2 +; OUTLINE-NEXT: sbc x11, x8, x3 +; OUTLINE-NEXT: stlxp w12, x10, x11, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB7_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_sub: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB7_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: subs x8, x4, x2 +; LSE-NEXT: sbc x9, x7, x3 +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; 
LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB7_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw sub i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 ret void } define void @fetch_and_min(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_min: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB8_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: cmp x2, x9 -; CHECK-NEXT: sbcs xzr, x3, x8 -; CHECK-NEXT: csel x10, x8, x3, ge -; CHECK-NEXT: csel x11, x9, x2, ge -; CHECK-NEXT: stlxp w12, x11, x10, [x0] -; CHECK-NEXT: cbnz w12, .LBB8_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_min: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB8_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: cmp x2, x9 +; NOOUTLINE-NEXT: sbcs xzr, x3, x8 +; NOOUTLINE-NEXT: csel x10, x8, x3, ge +; NOOUTLINE-NEXT: csel x11, x9, x2, ge +; NOOUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB8_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_min: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB8_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: cmp x2, x9 +; OUTLINE-NEXT: sbcs xzr, x3, x8 +; OUTLINE-NEXT: csel x10, x8, x3, ge +; OUTLINE-NEXT: csel x11, x9, x2, ge +; OUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB8_1 +; 
OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_min: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB8_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: cmp x2, x4 +; LSE-NEXT: sbcs xzr, x3, x7 +; LSE-NEXT: csel x9, x7, x3, ge +; LSE-NEXT: csel x8, x4, x2, ge +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB8_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw min i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 ret void } define void @fetch_and_max(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_max: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB9_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: cmp x2, x9 -; CHECK-NEXT: sbcs xzr, x3, x8 -; CHECK-NEXT: csel x10, x8, x3, lt -; CHECK-NEXT: csel x11, x9, x2, lt -; CHECK-NEXT: stlxp w12, x11, x10, [x0] -; CHECK-NEXT: cbnz w12, .LBB9_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_max: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB9_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: cmp x2, x9 +; NOOUTLINE-NEXT: sbcs xzr, x3, x8 +; NOOUTLINE-NEXT: csel x10, x8, x3, lt +; NOOUTLINE-NEXT: csel x11, x9, x2, lt +; NOOUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB9_1 +; 
NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_max: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB9_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: cmp x2, x9 +; OUTLINE-NEXT: sbcs xzr, x3, x8 +; OUTLINE-NEXT: csel x10, x8, x3, lt +; OUTLINE-NEXT: csel x11, x9, x2, lt +; OUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB9_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_max: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB9_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: cmp x2, x4 +; LSE-NEXT: sbcs xzr, x3, x7 +; LSE-NEXT: csel x9, x7, x3, lt +; LSE-NEXT: csel x8, x4, x2, lt +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB9_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw max i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 ret void } define void @fetch_and_umin(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_umin: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB10_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: cmp x2, x9 -; CHECK-NEXT: sbcs xzr, x3, x8 -; CHECK-NEXT: csel x10, x8, x3, hs -; CHECK-NEXT: csel x11, x9, x2, hs -; CHECK-NEXT: stlxp w12, x11, x10, [x0] -; CHECK-NEXT: cbnz w12, .LBB10_1 -; 
CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_umin: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB10_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: cmp x2, x9 +; NOOUTLINE-NEXT: sbcs xzr, x3, x8 +; NOOUTLINE-NEXT: csel x10, x8, x3, hs +; NOOUTLINE-NEXT: csel x11, x9, x2, hs +; NOOUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB10_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_umin: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB10_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: cmp x2, x9 +; OUTLINE-NEXT: sbcs xzr, x3, x8 +; OUTLINE-NEXT: csel x10, x8, x3, hs +; OUTLINE-NEXT: csel x11, x9, x2, hs +; OUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB10_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_umin: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB10_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: cmp x2, x4 +; LSE-NEXT: sbcs xzr, x3, x7 +; LSE-NEXT: csel x9, x7, x3, hs +; LSE-NEXT: csel x8, x4, x2, hs +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB10_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr 
x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw umin i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 ret void } define void @fetch_and_umax(i128* %p, i128 %bits) { -; CHECK-LABEL: fetch_and_umax: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB11_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x9, x8, [x0] -; CHECK-NEXT: cmp x2, x9 -; CHECK-NEXT: sbcs xzr, x3, x8 -; CHECK-NEXT: csel x10, x8, x3, lo -; CHECK-NEXT: csel x11, x9, x2, lo -; CHECK-NEXT: stlxp w12, x11, x10, [x0] -; CHECK-NEXT: cbnz w12, .LBB11_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: adrp x10, :got:var -; CHECK-NEXT: ldr x10, [x10, :got_lo12:var] -; CHECK-NEXT: stp x9, x8, [x10] -; CHECK-NEXT: ret - +; NOOUTLINE-LABEL: fetch_and_umax: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB11_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x9, x8, [x0] +; NOOUTLINE-NEXT: cmp x2, x9 +; NOOUTLINE-NEXT: sbcs xzr, x3, x8 +; NOOUTLINE-NEXT: csel x10, x8, x3, lo +; NOOUTLINE-NEXT: csel x11, x9, x2, lo +; NOOUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; NOOUTLINE-NEXT: cbnz w12, .LBB11_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: adrp x10, :got:var +; NOOUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; NOOUTLINE-NEXT: stp x9, x8, [x10] +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: fetch_and_umax: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB11_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x9, x8, [x0] +; OUTLINE-NEXT: cmp x2, x9 +; OUTLINE-NEXT: sbcs xzr, x3, x8 +; OUTLINE-NEXT: csel x10, x8, x3, lo +; OUTLINE-NEXT: csel x11, x9, x2, lo +; OUTLINE-NEXT: stlxp w12, x11, x10, [x0] +; OUTLINE-NEXT: cbnz w12, .LBB11_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: adrp x10, :got:var +; OUTLINE-NEXT: ldr x10, [x10, :got_lo12:var] +; OUTLINE-NEXT: stp x9, x8, [x10] +; 
OUTLINE-NEXT: ret +; +; LSE-LABEL: fetch_and_umax: +; LSE: // %bb.0: +; LSE-NEXT: ldp x4, x5, [x0] +; LSE-NEXT: .LBB11_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: cmp x2, x4 +; LSE-NEXT: sbcs xzr, x3, x7 +; LSE-NEXT: csel x9, x7, x3, lo +; LSE-NEXT: csel x8, x4, x2, lo +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: caspal x4, x5, x8, x9, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB11_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: adrp x8, :got:var +; LSE-NEXT: ldr x8, [x8, :got_lo12:var] +; LSE-NEXT: stp x4, x5, [x8] +; LSE-NEXT: ret %val = atomicrmw umax i128* %p, i128 %bits seq_cst store i128 %val, i128* @var, align 16 ret void } define i128 @atomic_load_seq_cst(i128* %p) { -; CHECK-LABEL: atomic_load_seq_cst: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: .LBB12_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp x0, x1, [x8] -; CHECK-NEXT: stlxp w9, x0, x1, [x8] -; CHECK-NEXT: cbnz w9, .LBB12_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: atomic_load_seq_cst: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: mov x8, x0 +; NOOUTLINE-NEXT: .LBB12_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp x0, x1, [x8] +; NOOUTLINE-NEXT: stlxp w9, x0, x1, [x8] +; NOOUTLINE-NEXT: cbnz w9, .LBB12_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: atomic_load_seq_cst: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: mov x8, x0 +; OUTLINE-NEXT: .LBB12_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp x0, x1, [x8] +; OUTLINE-NEXT: stlxp w9, x0, x1, [x8] +; OUTLINE-NEXT: cbnz w9, .LBB12_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: ret +; +; LSE-LABEL: atomic_load_seq_cst: +; LSE: // %bb.0: 
+; LSE-NEXT: mov x2, #0 +; LSE-NEXT: mov x3, #0 +; LSE-NEXT: caspal x2, x3, x2, x3, [x0] +; LSE-NEXT: mov x0, x2 +; LSE-NEXT: mov x1, x3 +; LSE-NEXT: ret %r = load atomic i128, i128* %p seq_cst, align 16 ret i128 %r } define i128 @atomic_load_relaxed(i64, i64, i128* %p) { -; CHECK-LABEL: atomic_load_relaxed: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB13_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldxp x0, x1, [x2] -; CHECK-NEXT: stxp w8, x0, x1, [x2] -; CHECK-NEXT: cbnz w8, .LBB13_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: atomic_load_relaxed: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB13_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldxp x0, x1, [x2] +; NOOUTLINE-NEXT: stxp w8, x0, x1, [x2] +; NOOUTLINE-NEXT: cbnz w8, .LBB13_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: atomic_load_relaxed: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB13_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldxp x0, x1, [x2] +; OUTLINE-NEXT: stxp w8, x0, x1, [x2] +; OUTLINE-NEXT: cbnz w8, .LBB13_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: ret +; +; LSE-LABEL: atomic_load_relaxed: +; LSE: // %bb.0: +; LSE-NEXT: mov x0, #0 +; LSE-NEXT: mov x1, #0 +; LSE-NEXT: casp x0, x1, x0, x1, [x2] +; LSE-NEXT: ret %r = load atomic i128, i128* %p monotonic, align 16 ret i128 %r } define void @atomic_store_seq_cst(i128 %in, i128* %p) { -; CHECK-LABEL: atomic_store_seq_cst: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB14_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldaxp xzr, x8, [x2] -; CHECK-NEXT: stlxp w8, x0, x1, [x2] -; CHECK-NEXT: cbnz w8, .LBB14_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: atomic_store_seq_cst: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB14_1: // 
%atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldaxp xzr, x8, [x2] +; NOOUTLINE-NEXT: stlxp w8, x0, x1, [x2] +; NOOUTLINE-NEXT: cbnz w8, .LBB14_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: atomic_store_seq_cst: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB14_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldaxp xzr, x8, [x2] +; OUTLINE-NEXT: stlxp w8, x0, x1, [x2] +; OUTLINE-NEXT: cbnz w8, .LBB14_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: ret +; +; LSE-LABEL: atomic_store_seq_cst: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $x1 killed $x1 killed $x0_x1 def $x0_x1 +; LSE-NEXT: ldp x4, x5, [x2] +; LSE-NEXT: // kill: def $x0 killed $x0 killed $x0_x1 def $x0_x1 +; LSE-NEXT: .LBB14_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: caspal x6, x7, x0, x1, [x2] +; LSE-NEXT: cmp x7, x5 +; LSE-NEXT: ccmp x6, x4, #0, eq +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: b.ne .LBB14_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ret store atomic i128 %in, i128* %p seq_cst, align 16 ret void } define void @atomic_store_release(i128 %in, i128* %p) { -; CHECK-LABEL: atomic_store_release: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB15_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldxp xzr, x8, [x2] -; CHECK-NEXT: stlxp w8, x0, x1, [x2] -; CHECK-NEXT: cbnz w8, .LBB15_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: atomic_store_release: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB15_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldxp xzr, x8, [x2] +; NOOUTLINE-NEXT: stlxp w8, x0, x1, [x2] +; NOOUTLINE-NEXT: cbnz w8, .LBB15_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: ret +; +; 
OUTLINE-LABEL: atomic_store_release: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB15_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldxp xzr, x8, [x2] +; OUTLINE-NEXT: stlxp w8, x0, x1, [x2] +; OUTLINE-NEXT: cbnz w8, .LBB15_1 +; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: ret +; +; LSE-LABEL: atomic_store_release: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $x1 killed $x1 killed $x0_x1 def $x0_x1 +; LSE-NEXT: ldp x4, x5, [x2] +; LSE-NEXT: // kill: def $x0 killed $x0 killed $x0_x1 def $x0_x1 +; LSE-NEXT: .LBB15_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: caspl x6, x7, x0, x1, [x2] +; LSE-NEXT: cmp x7, x5 +; LSE-NEXT: ccmp x6, x4, #0, eq +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: b.ne .LBB15_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ret store atomic i128 %in, i128* %p release, align 16 ret void } define void @atomic_store_relaxed(i128 %in, i128* %p) { -; CHECK-LABEL: atomic_store_relaxed: -; CHECK: // %bb.0: -; CHECK-NEXT: .LBB16_1: // %atomicrmw.start -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldxp xzr, x8, [x2] -; CHECK-NEXT: stxp w8, x0, x1, [x2] -; CHECK-NEXT: cbnz w8, .LBB16_1 -; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: ret +; NOOUTLINE-LABEL: atomic_store_relaxed: +; NOOUTLINE: // %bb.0: +; NOOUTLINE-NEXT: .LBB16_1: // %atomicrmw.start +; NOOUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOOUTLINE-NEXT: ldxp xzr, x8, [x2] +; NOOUTLINE-NEXT: stxp w8, x0, x1, [x2] +; NOOUTLINE-NEXT: cbnz w8, .LBB16_1 +; NOOUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; NOOUTLINE-NEXT: ret +; +; OUTLINE-LABEL: atomic_store_relaxed: +; OUTLINE: // %bb.0: +; OUTLINE-NEXT: .LBB16_1: // %atomicrmw.start +; OUTLINE-NEXT: // =>This Inner Loop Header: Depth=1 +; OUTLINE-NEXT: ldxp xzr, x8, [x2] +; OUTLINE-NEXT: stxp w8, x0, x1, [x2] +; OUTLINE-NEXT: cbnz w8, .LBB16_1 
+; OUTLINE-NEXT: // %bb.2: // %atomicrmw.end +; OUTLINE-NEXT: ret +; +; LSE-LABEL: atomic_store_relaxed: +; LSE: // %bb.0: +; LSE-NEXT: // kill: def $x1 killed $x1 killed $x0_x1 def $x0_x1 +; LSE-NEXT: ldp x4, x5, [x2] +; LSE-NEXT: // kill: def $x0 killed $x0 killed $x0_x1 def $x0_x1 +; LSE-NEXT: .LBB16_1: // %atomicrmw.start +; LSE-NEXT: // =>This Inner Loop Header: Depth=1 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: mov x7, x5 +; LSE-NEXT: casp x6, x7, x0, x1, [x2] +; LSE-NEXT: cmp x7, x5 +; LSE-NEXT: ccmp x6, x4, #0, eq +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: b.ne .LBB16_1 +; LSE-NEXT: // %bb.2: // %atomicrmw.end +; LSE-NEXT: ret store atomic i128 %in, i128* %p unordered, align 16 ret void } diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll index 512a28b..24af03c 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -9437,8 +9437,7 @@ define dso_local void @test_atomic_load_xor_i64_noret_seq_cst(i64 %offset) nounw define dso_local i128 @test_atomic_load_i128() nounwind { ; CHECK-LABEL: test_atomic_load_i128: -; CHECK: ldxp -; CHECK: stxp +; CHECK: casp ; OUTLINE-ATOMICS-LABEL: test_atomic_load_i128: ; OUTLINE-ATOMICS: ldxp diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll index 2cc6b48..cb2c5c5 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-xchg-fp.ll @@ -97,14 +97,20 @@ define fp128 @test_rmw_xchg_f128(fp128* %dst, fp128 %new) { ; LSE-NEXT: sub sp, sp, #32 ; LSE-NEXT: .cfi_def_cfa_offset 32 ; LSE-NEXT: str q0, [sp, #16] -; LSE-NEXT: ldp x9, x8, [sp, #16] +; LSE-NEXT: ldp x2, x3, [sp, #16] +; LSE-NEXT: ldp x4, x5, [x0] ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldaxp x11, x10, [x0] -; LSE-NEXT: stlxp w12, x9, x8, [x0] -; LSE-NEXT: cbnz w12, .LBB3_1 +; LSE-NEXT: mov x7, 
x5 +; LSE-NEXT: mov x6, x4 +; LSE-NEXT: mov x5, x7 +; LSE-NEXT: mov x4, x6 +; LSE-NEXT: caspal x4, x5, x2, x3, [x0] +; LSE-NEXT: cmp x5, x7 +; LSE-NEXT: ccmp x4, x6, #0, eq +; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: stp x11, x10, [sp] +; LSE-NEXT: stp x4, x5, [sp] ; LSE-NEXT: ldr q0, [sp], #32 ; LSE-NEXT: ret %res = atomicrmw xchg fp128* %dst, fp128 %new seq_cst -- 2.7.4