From: Tim Northover Date: Thu, 14 Apr 2016 17:03:29 +0000 (+0000) Subject: AArch64: expand cmpxchg after regalloc at -O0. X-Git-Tag: llvmorg-3.9.0-rc1~9084 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cdf1529c010d28a9102f770727cbdc2f74768de2;p=platform%2Fupstream%2Fllvm.git AArch64: expand cmpxchg after regalloc at -O0. FastRegAlloc works only at the basic-block level and spills all live-out registers. Unfortunately for a stack-based cmpxchg near the spill slots, this can perpetually clear the exclusive monitor, which means the cmpxchg will never succeed. I believe the only way to handle this within LLVM is by expanding the loop post-regalloc. We don't want this in general because it severely limits the optimisations that can be done, so we limit this to -O0 compilations. It's an ugly hack, and about the one good point in the whole mess is that we can treat all cmpxchg operations in the most naive way possible (seq_cst, no clrex faff) without affecting correctness. Should fix PR25526. llvm-svn: 266339 --- diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 610fcc4..4fb1bb1 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" @@ -46,9 +47,18 @@ public: private: bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + + bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, + unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI); + bool expandCMP_SWAP_128(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char AArch64ExpandPseudo::ID = 0; } @@ -579,10 +589,176 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } +void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MBB->addLiveIn(*I); +} + +bool AArch64ExpandPseudo::expandCMP_SWAP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, + unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(&MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + 
MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxr xDest, [xAddr] + // cmp xDest, xDesired + // b.ne .Ldone + MBB.addSuccessor(LoadCmpBB); + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) + .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) + .addOperand(Desired) + .addImm(ExtendImm); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxr wStatus, xNew, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addOperand(New) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool AArch64ExpandPseudo::expandCMP_SWAP_128( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &DestLo = MI.getOperand(0); + MachineOperand &DestHi = MI.getOperand(1); + unsigned StatusReg = MI.getOperand(2).getReg(); + MachineOperand &Addr = MI.getOperand(3); + MachineOperand &DesiredLo = MI.getOperand(4); + MachineOperand &DesiredHi = MI.getOperand(5); + MachineOperand &NewLo = MI.getOperand(6); + MachineOperand &NewHi = MI.getOperand(7); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(&MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxp xDestLo, xDestHi, [xAddr] + // cmp xDestLo, xDesiredLo + // sbcs xDestHi, xDesiredHi + // b.ne .Ldone + MBB.addSuccessor(LoadCmpBB); + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(DestLo.getReg()); + LoadCmpBB->addLiveIn(DestHi.getReg()); + LoadCmpBB->addLiveIn(DesiredLo.getReg()); + LoadCmpBB->addLiveIn(DesiredHi.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) + .addReg(DestLo.getReg(), RegState::Define) + .addReg(DestHi.getReg(), RegState::Define) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) + .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) + .addOperand(DesiredLo) + .addImm(0); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR) + .addReg(DestHi.getReg(), 
getKillRegState(DestHi.isDead())) + .addOperand(DesiredHi); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxp wStatus, xNewLo, xNewHi, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(NewLo.getReg()); + StoreBB->addLiveIn(NewHi.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) + .addOperand(NewLo) + .addOperand(NewHi) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + /// \brief If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -724,6 +900,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::CMP_SWAP_8: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_16: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_32: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW, + AArch64::SUBSWrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_64: + return expandCMP_SWAP(MBB, MBBI, + AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::XZR, NextMBBI); + case AArch64::CMP_SWAP_128: + return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); } return false; } @@ -736,7 +934,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); + Modified |= expandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index e439199..15eee80 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -198,6 +198,9 @@ private: } bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); + + void SelectCMP_SWAP(SDNode *N); + }; } // end anonymous namespace @@ -2296,6 +2299,36 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { return nullptr; } +/// We've got special pseudo-instructions for these +void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { + unsigned Opcode; + EVT MemTy = cast(N)->getMemoryVT(); + if (MemTy == MVT::i8) + Opcode = AArch64::CMP_SWAP_8; + else if (MemTy == MVT::i16) + Opcode = AArch64::CMP_SWAP_16; 
+ else if (MemTy == MVT::i32) + Opcode = AArch64::CMP_SWAP_32; + else if (MemTy == MVT::i64) + Opcode = AArch64::CMP_SWAP_64; + else + llvm_unreachable("Unknown AtomicCmpSwap type"); + + MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32; + SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(0)}; + SDNode *CmpSwap = CurDAG->getMachineNode( + Opcode, SDLoc(N), + CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); +} + SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(errs() << "Selecting: "); @@ -2317,6 +2350,10 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; + case ISD::ATOMIC_CMP_SWAP: + SelectCMP_SWAP(Node); + return nullptr; + case ISD::READ_REGISTER: if (SDNode *Res = SelectReadRegister(Node)) return Res; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4d7f774..580eaa5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -412,6 +412,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -10050,6 +10052,31 @@ static void ReplaceReductionResults(SDNode *N, Results.push_back(SplitVal); } +static void ReplaceCMP_SWAP_128Results(SDNode *N, + SmallVectorImpl & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + SDValue Ops[] = {N->getOperand(1), + N->getOperand(2)->getOperand(0), + N->getOperand(2)->getOperand(1), + N->getOperand(3)->getOperand(0), + N->getOperand(3)->getOperand(1), + N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + AArch64::CMP_SWAP_128, SDLoc(N), + DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(SDValue(CmpSwap, 0)); + Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(SDValue(CmpSwap, 3)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -10081,6 +10108,9 @@ void AArch64TargetLowering::ReplaceNodeResults( assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_128Results(N, Results, DAG); + return; } } @@ -10132,7 +10162,12 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. 
If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. + return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index a88e7e8..59de62a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -362,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), // And clear exclusive. def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; + +//===---------------------------------- +// Atomic cmpxchg for -O0 +//===---------------------------------- + +// The fast register allocator used during -O0 inserts spills to cover any VRegs +// live across basic block boundaries. When this happens between an LDXR and an +// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to +// fail. + +// Unfortunately, this means we have to have an alternative (expanded +// post-regalloc) path for -O0 compilations. Fortunately this path can be +// significantly more naive than the standard expansion: we conservatively +// assume seq_cst, strong cmpxchg and omit clrex on failure. + +let Constraints = "@earlyclobber $Rd,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in { +def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>, + Sched<[WriteAtomic]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in +def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll new file mode 100644 index 0000000..6c6b022 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -O0 %s -o - | FileCheck %s + +define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { +; CHECK-LABEL: test_cmpxchg_8: +; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxrb [[OLD:w[0-9]+]], [x0] +; CHECK: cmp [[OLD]], w1, uxtb +; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxrb [[STATUS:w[3-9]]], w2, [x0] +; CHECK: cbnz [[STATUS]], [[RETRY]] +; CHECK: [[DONE]]: +; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 +; CHECK: cset {{w[0-9]+}}, eq + %res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic + ret { i8, i1 } %res +} + +define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind { +; CHECK-LABEL: test_cmpxchg_16: +; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxrh [[OLD:w[0-9]+]], [x0] +; CHECK: cmp [[OLD]], w1, uxth +; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxrh [[STATUS:w[3-9]]], w2, [x0] +; CHECK: cbnz 
[[STATUS]], [[RETRY]] +; CHECK: [[DONE]]: +; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 +; CHECK: cset {{w[0-9]+}}, eq + %res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic + ret { i16, i1 } %res +} + +define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind { +; CHECK-LABEL: test_cmpxchg_32: +; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[OLD:w[0-9]+]], [x0] +; CHECK: cmp [[OLD]], w1 +; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxr [[STATUS:w[3-9]]], w2, [x0] +; CHECK: cbnz [[STATUS]], [[RETRY]] +; CHECK: [[DONE]]: +; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 +; CHECK: cset {{w[0-9]+}}, eq + %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic + ret { i32, i1 } %res +} + +define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind { +; CHECK-LABEL: test_cmpxchg_64: +; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[OLD:x[0-9]+]], [x0] +; CHECK: cmp [[OLD]], x1 +; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxr [[STATUS:w[3-9]]], x2, [x0] +; CHECK: cbnz [[STATUS]], [[RETRY]] +; CHECK: [[DONE]]: +; CHECK: subs {{x[0-9]+}}, [[OLD]], x1 +; CHECK: cset {{w[0-9]+}}, eq + %res = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst monotonic + ret { i64, i1 } %res +} + +define { i128, i1 } @test_cmpxchg_128(i128* %addr, i128 %desired, i128 %new) nounwind { +; CHECK-LABEL: test_cmpxchg_128: +; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0] +; CHECK: cmp [[OLD_LO]], x2 +; CHECK: sbcs xzr, [[OLD_HI]], x3 +; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxp [[STATUS:w[0-9]+]], x4, x5, [x0] +; CHECK: cbnz [[STATUS]], [[RETRY]] +; CHECK: [[DONE]]: + %res = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst monotonic + ret { i128, i1 } %res +}
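---

As a standalone illustration of the failure mode described in the commit message (this is not part of the patch; the file and function names below are made up), the problem shows up for a cmpxchg whose address is itself a stack slot, compiled at -O0, e.g. with "llc -mtriple=aarch64-linux-gnu -O0 repro.ll -o -":

; repro.ll -- hypothetical reproducer, not included in this commit.
; Prior to this change, shouldExpandAtomicCmpXchgInIR always returned true, so
; the cmpxchg became an IR-level ldaxr/stlxr loop before register allocation.
; At -O0 the fast register allocator spills values that are live across the
; loop's blocks, and those stack accesses can land close enough to %slot to
; clear the exclusive monitor on every iteration, so the store-conditional may
; never succeed.
define i32 @stack_cmpxchg(i32 %desired, i32 %new) nounwind {
  %slot = alloca i32
  store i32 %desired, i32* %slot
  %pair = cmpxchg i32* %slot, i32 %desired, i32 %new seq_cst monotonic
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}

With this patch applied, the -O0 path instead selects the CMP_SWAP_* pseudo and expands the whole loop after register allocation, so no spill code can be inserted between the load-exclusive and the store-exclusive.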