From 00e372137c8a604b59b995131cebf2d84f097544 Mon Sep 17 00:00:00 2001
From: Tim Northover
Date: Wed, 9 Feb 2022 11:42:36 +0000
Subject: [PATCH] AArch64: do not use xzr for ldxp -> stxp dataflow.

If the result of a cmpxchg is unused, regalloc chooses `xzr` for the defs
of CMP_SWAP_128*. However, on the failure path this gets expanded to an
LDXP -> STXP sequence that stores the original value back (to ensure no
tearing occurred), so picking xzr unintentionally nulls out half of that
value.

So instead, use GPR64common for these defs so that regalloc has to choose
a real register.
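For example, if regalloc were to pick xzr for $RdLo, the non-LSE expansion
would look roughly like the sketch below (the label and the other register
numbers are illustrative, not exact output):

    .LBB0_1:
      ldxp  xzr, x9, [x0]       // low half of the loaded value is discarded
      cmp   xzr, x2             // ...so this compares 0, not the loaded value
      ...                       // high-half compare; branch away on success
      stxp  w10, xzr, x9, [x0]  // failure path: meant to re-store the original
                                // value, but writes 0 for its low half
      cbnz  w10, .LBB0_1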
---
 llvm/lib/Target/AArch64/AArch64InstrAtomics.td      |  3 +-
 .../AArch64/GlobalISel/legalize-cmpxchg-128.mir     | 56 +++++++++++-----------
 llvm/test/CodeGen/AArch64/arm64-atomic-128.ll       | 49 +++++++++++++++++++
 3 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index b220929..7d62b9e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -490,7 +490,8 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
 
 let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
     mayLoad = 1, mayStore = 1 in {
-class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch),
+class cmp_swap_128 : Pseudo<(outs GPR64common:$RdLo, GPR64common:$RdHi,
+                                  GPR32common:$scratch),
                             (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
                                  GPR64:$newLo, GPR64:$newHi), []>,
                      Sched<[WriteAtomic]>;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
index f051fe7..6a6e0b6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
@@ -30,35 +30,37 @@ body: |
     ; CHECK: RET_ReallyLR
     ; CHECK-NOLSE-LABEL: name: compare_swap_128
     ; CHECK-NOLSE: liveins: $x0_x1, $x1, $x0, $x1, $x2, $x3, $x4
-    ; CHECK-NOLSE: [[COPY:%[0-9]+]]:gpr64(p0) = COPY $x0
-    ; CHECK-NOLSE: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
-    ; CHECK-NOLSE: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
-    ; CHECK-NOLSE: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
-    ; CHECK-NOLSE: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
-    ; CHECK-NOLSE: [[COPY5:%[0-9]+]]:gpr64(s64) = COPY [[COPY1]](s64)
-    ; CHECK-NOLSE: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
-    ; CHECK-NOLSE: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
-    ; CHECK-NOLSE: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
-    ; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
-    ; CHECK-NOLSE: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
-    ; CHECK-NOLSE: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
-    ; CHECK-NOLSE: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))
-    ; CHECK-NOLSE: RET_ReallyLR
+    ; CHECK-NOLSE-NEXT: {{ $}}
+    ; CHECK-NOLSE-NEXT: [[COPY:%[0-9]+]]:gpr64(p0) = COPY $x0
+    ; CHECK-NOLSE-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NOLSE-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NOLSE-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NOLSE-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NOLSE-NEXT: [[COPY5:%[0-9]+]]:gpr64(s64) = COPY [[COPY1]](s64)
+    ; CHECK-NOLSE-NEXT: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
+    ; CHECK-NOLSE-NEXT: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
+    ; CHECK-NOLSE-NEXT: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
+    ; CHECK-NOLSE-NEXT: early-clobber %13:gpr64common(s64), early-clobber %14:gpr64common(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
+    ; CHECK-NOLSE-NEXT: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
+    ; CHECK-NOLSE-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
+    ; CHECK-NOLSE-NEXT: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))
+    ; CHECK-NOLSE-NEXT: RET_ReallyLR
     ; CHECK-LSE-LABEL: name: compare_swap_128
     ; CHECK-LSE: liveins: $x0_x1, $x1, $x0, $x1, $x2, $x3, $x4
-    ; CHECK-LSE: [[COPY:%[0-9]+]]:gpr64sp(p0) = COPY $x0
-    ; CHECK-LSE: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
-    ; CHECK-LSE: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
-    ; CHECK-LSE: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
-    ; CHECK-LSE: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
-    ; CHECK-LSE: [[REG_SEQUENCE:%[0-9]+]]:xseqpairsclass(s128) = REG_SEQUENCE [[COPY1]](s64), %subreg.sube64, [[COPY2]](s64), %subreg.subo64
-    ; CHECK-LSE: [[REG_SEQUENCE1:%[0-9]+]]:xseqpairsclass(s128) = REG_SEQUENCE [[COPY3]](s64), %subreg.sube64, [[COPY4]](s64), %subreg.subo64
-    ; CHECK-LSE: [[CASPAX:%[0-9]+]]:xseqpairsclass(s128) = CASPAX [[REG_SEQUENCE]](s128), [[REG_SEQUENCE1]](s128), [[COPY]](p0) :: (load store acquire acquire (s128))
-    ; CHECK-LSE: [[EXTRACT:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 0
-    ; CHECK-LSE: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 64
-    ; CHECK-LSE: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[EXTRACT]](s64), [[EXTRACT1]](s64)
-    ; CHECK-LSE: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))
-    ; CHECK-LSE: RET_ReallyLR
+    ; CHECK-LSE-NEXT: {{ $}}
+    ; CHECK-LSE-NEXT: [[COPY:%[0-9]+]]:gpr64sp(p0) = COPY $x0
+    ; CHECK-LSE-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-LSE-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-LSE-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-LSE-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-LSE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:xseqpairsclass(s128) = REG_SEQUENCE [[COPY1]](s64), %subreg.sube64, [[COPY2]](s64), %subreg.subo64
+    ; CHECK-LSE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:xseqpairsclass(s128) = REG_SEQUENCE [[COPY3]](s64), %subreg.sube64, [[COPY4]](s64), %subreg.subo64
+    ; CHECK-LSE-NEXT: [[CASPAX:%[0-9]+]]:xseqpairsclass(s128) = CASPAX [[REG_SEQUENCE]](s128), [[REG_SEQUENCE1]](s128), [[COPY]](p0) :: (load store acquire acquire (s128))
+    ; CHECK-LSE-NEXT: [[EXTRACT:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 0
+    ; CHECK-LSE-NEXT: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 64
+    ; CHECK-LSE-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[EXTRACT]](s64), [[EXTRACT1]](s64)
+    ; CHECK-LSE-NEXT: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))
+    ; CHECK-LSE-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
     %3:_(s64) = COPY $x1
     %4:_(s64) = COPY $x2
diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
index d6aabd1..2f8d06e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -474,3 +474,52 @@ define void @atomic_store_relaxed(i128 %in, i128* %p) {
   store atomic i128 %in, i128* %p unordered, align 16
   ret void
 }
+
+; Since we store the original value to ensure no tearing for the unsuccessful
+; case, the register used must not be xzr.
+define void @cmpxchg_dead(i128* %ptr, i128 %desired, i128 %new) {
+; NOOUTLINE-LABEL: cmpxchg_dead:
+; NOOUTLINE:       // %bb.0:
+; NOOUTLINE-NEXT:  .LBB17_1: // =>This Inner Loop Header: Depth=1
+; NOOUTLINE-NEXT:    ldxp x8, x9, [x0]
+; NOOUTLINE-NEXT:    cmp x8, x2
+; NOOUTLINE-NEXT:    cset w10, ne
+; NOOUTLINE-NEXT:    cmp x9, x3
+; NOOUTLINE-NEXT:    cinc w10, w10, ne
+; NOOUTLINE-NEXT:    cbz w10, .LBB17_3
+; NOOUTLINE-NEXT:  // %bb.2: // in Loop: Header=BB17_1 Depth=1
+; NOOUTLINE-NEXT:    stxp w10, x8, x9, [x0]
+; NOOUTLINE-NEXT:    cbnz w10, .LBB17_1
+; NOOUTLINE-NEXT:    b .LBB17_4
+; NOOUTLINE-NEXT:  .LBB17_3: // in Loop: Header=BB17_1 Depth=1
+; NOOUTLINE-NEXT:    stxp w10, x4, x5, [x0]
+; NOOUTLINE-NEXT:    cbnz w10, .LBB17_1
+; NOOUTLINE-NEXT:  .LBB17_4:
+; NOOUTLINE-NEXT:    ret
+;
+; OUTLINE-LABEL: cmpxchg_dead:
+; OUTLINE:       // %bb.0:
+; OUTLINE-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; OUTLINE-NEXT:    .cfi_def_cfa_offset 16
+; OUTLINE-NEXT:    .cfi_offset w30, -16
+; OUTLINE-NEXT:    mov x1, x3
+; OUTLINE-NEXT:    mov x8, x0
+; OUTLINE-NEXT:    mov x0, x2
+; OUTLINE-NEXT:    mov x2, x4
+; OUTLINE-NEXT:    mov x3, x5
+; OUTLINE-NEXT:    mov x4, x8
+; OUTLINE-NEXT:    bl __aarch64_cas16_relax
+; OUTLINE-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; OUTLINE-NEXT:    ret
+;
+; LSE-LABEL: cmpxchg_dead:
+; LSE:       // %bb.0:
+; LSE-NEXT:    // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
+; LSE-NEXT:    // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
+; LSE-NEXT:    // kill: def $x4 killed $x4 killed $x4_x5 def $x4_x5
+; LSE-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
+; LSE-NEXT:    casp x2, x3, x4, x5, [x0]
+; LSE-NEXT:    ret
+  cmpxchg i128* %ptr, i128 %desired, i128 %new monotonic monotonic
+  ret void
+}
-- 
2.7.4