llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
}
+ /// Perform an atomicrmw expansion in a target-specific way (chosen by
+ /// returning AtomicExpansionKind::Expand from the shouldExpand hooks). This
+ /// is expected to be called when the masked atomicrmw and bit test atomicrmw
+ /// expansions don't apply, and the target supports another way to lower the
+ /// atomicrmw.
+ virtual void emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+ llvm_unreachable(
+ "Generic atomicrmw expansion unimplemented on this target");
+ }
+
/// Perform a bit test atomicrmw using a target-specific intrinsic. This
/// represents the combined bit test intrinsic which will be lowered at a late
/// stage by the backend.
}
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicRMWInst(AI);
+ case TargetLoweringBase::AtomicExpansionKind::Expand:
+ TLI->emitExpandAtomicRMW(AI);
+ return true;
default:
llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
}
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ // For an FP32 atomicrmw in the flat address space, try to expand it when
+ // the target supports both global and LDS atomic fadd. The expansion
+ // emits runtime address-space checks: a global atomic fadd for global
+ // pointers, an LDS atomic fadd for shared pointers, and a plain
+ // load/fadd/store sequence for private pointers.
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
+ Subtarget->hasLDSFPAtomicAdd()) {
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+ return AtomicExpansionKind::Expand;
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+ return AtomicExpansionKind::Expand;
+ }
+
return AtomicExpansionKind::CmpXChg;
}
}
return false;
}
+
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+ // NOTE(review): the expansion decision gates on hasLDSFPAtomicAdd() plus
+ // hasAtomicFadd{No,}RtnInsts(); assuming hasAtomicFaddInsts() implies those
+ // predicates — confirm they stay in sync if subtarget features change.
+ assert(Subtarget->hasAtomicFaddInsts() &&
+ "target should have atomic fadd instructions");
+ assert(AI->getType()->isFloatTy() &&
+ AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+ "generic atomicrmw expansion only supports FP32 operand in flat "
+ "address space");
+ assert(AI->getOperation() == AtomicRMWInst::FAdd &&
+ "only fadd is supported for now");
+
+ // Given: atomicrmw fadd float* %addr, float %val ordering
+ //
+ // With this expansion we produce the following code:
+ // [...]
+ // %int8ptr = bitcast float* %addr to i8*
+ // br label %atomicrmw.check.shared
+ //
+ // atomicrmw.check.shared:
+ // %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr)
+ // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
+ //
+ // atomicrmw.shared:
+ // %cast.shared = addrspacecast float* %addr to float addrspace(3)*
+ // %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared,
+ // float %val ordering
+ // br label %atomicrmw.phi
+ //
+ // atomicrmw.check.private:
+ // %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr)
+ // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
+ //
+ // atomicrmw.private:
+ // %cast.private = addrspacecast float* %addr to float addrspace(5)*
+ // %loaded.private = load float, float addrspace(5)* %cast.private
+ // %val.new = fadd float %loaded.private, %val
+ // store float %val.new, float addrspace(5)* %cast.private
+ // br label %atomicrmw.phi
+ //
+ // atomicrmw.global:
+ // %cast.global = addrspacecast float* %addr to float addrspace(1)*
+ // %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global,
+ // float %val ordering
+ // br label %atomicrmw.phi
+ //
+ // atomicrmw.phi:
+ // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
+ // [ %loaded.private, %atomicrmw.private ],
+ // [ %loaded.global, %atomicrmw.global ]
+ // br label %atomicrmw.end
+ //
+ // atomicrmw.end:
+ // [...]
+
+ IRBuilder<> Builder(AI);
+ LLVMContext &Ctx = Builder.getContext();
+
+ // Split the current block at the atomicrmw; everything after it becomes
+ // atomicrmw.end, and the new blocks below are inserted before it in order.
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+ BasicBlock *CheckSharedBB =
+ BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
+ BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+ BasicBlock *CheckPrivateBB =
+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+ BasicBlock *PrivateBB =
+ BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
+ BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
+ BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
+
+ Value *Val = AI->getValOperand();
+ Type *ValTy = Val->getType();
+ Value *Addr = AI->getPointerOperand();
+ PointerType *PtrTy = cast<PointerType>(Addr->getType());
+
+ // Re-emit the original atomicrmw on an address-space-qualified pointer,
+ // carrying over the operation, alignment, ordering, syncscope, and all
+ // metadata from the flat-pointer instruction being replaced.
+ auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
+ Value *Val) -> Value * {
+ AtomicRMWInst *OldVal =
+ Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
+ AI->getOrdering(), AI->getSyncScopeID());
+ SmallVector<std::pair<unsigned, MDNode *>> MDs;
+ AI->getAllMetadata(MDs);
+ for (auto &P : MDs)
+ OldVal->setMetadata(P.first, P.second);
+ return OldVal;
+ };
+
+ // Drop the unconditional branch splitBasicBlock appended to BB; we replace
+ // it with the bitcast plus a branch into the address-space dispatch chain.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
+ Builder.CreateBr(CheckSharedBB);
+
+ Builder.SetInsertPoint(CheckSharedBB);
+ CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
+ {Int8Ptr}, nullptr, "is.shared");
+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+
+ Builder.SetInsertPoint(SharedBB);
+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
+ Addr,
+ PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
+ Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
+ Builder.CreateBr(PhiBB);
+
+ Builder.SetInsertPoint(CheckPrivateBB);
+ CallInst *IsPrivate = Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private");
+ Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
+
+ // Private memory is only visible to the current lane, so a non-atomic
+ // load/fadd/store sequence is sufficient there.
+ Builder.SetInsertPoint(PrivateBB);
+ Value *CastToPrivate = Builder.CreateAddrSpaceCast(
+ Addr,
+ PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS));
+ Value *LoadedPrivate =
+ Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
+ Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
+ Builder.CreateStore(NewVal, CastToPrivate);
+ Builder.CreateBr(PhiBB);
+
+ Builder.SetInsertPoint(GlobalBB);
+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+ Addr,
+ PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
+ Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
+ Builder.CreateBr(PhiBB);
+
+ // Merge the three loaded values; this phi takes over all uses of the
+ // original flat atomicrmw result.
+ Builder.SetInsertPoint(PhiBB);
+ PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
+ Loaded->addIncoming(LoadedShared, SharedBB);
+ Loaded->addIncoming(LoadedPrivate, PrivateBB);
+ Loaded->addIncoming(LoadedGlobal, GlobalBB);
+ Builder.CreateBr(ExitBB);
+
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+}
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+ void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
const TargetRegisterClass *getRegClassFor(MVT VT,
bool isDivergent) const override;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
+
+define float @syncscope_system(float* %addr, float %val) #0 {
+; GFX908-LABEL: syncscope_system:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1_vol
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB0_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: syncscope_system:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: syncscope_system:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB0_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: syncscope_system:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
+; GFX1100-NEXT: s_mov_b32 s0, 0
+; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_mov_b32_e32 v4, v3
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: buffer_gl0_inv
+; GFX1100-NEXT: buffer_gl1_inv
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT: s_cbranch_execnz .LBB0_1
+; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %res = atomicrmw fadd float* %addr, float %val seq_cst
+ ret float %res
+}
+
+define float @syncscope_workgroup_rtn(float* %addr, float %val) #0 {
+; GFX908-LABEL: syncscope_workgroup_rtn:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB1_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: syncscope_workgroup_rtn:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX90A-NEXT: s_lshl_b32 s4, s4, 16
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB1_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; GFX90A-NEXT: s_lshl_b32 s6, s6, 16
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB1_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB1_3: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB1_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB1_5: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB1_6: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB1_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: syncscope_workgroup_rtn:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: syncscope_workgroup_rtn:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: buffer_gl0_inv
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret float %res
+}
+
+define void @syncscope_workgroup_nortn(float* %addr, float %val) #0 {
+; GFX908-LABEL: syncscope_workgroup_nortn:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX908-NEXT: s_lshl_b32 s4, s4, 16
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB2_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB2_8
+; GFX908-NEXT: .LBB2_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; GFX908-NEXT: s_lshl_b32 s6, s6, 16
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB2_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB2_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB2_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB2_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB2_2
+; GFX908-NEXT: .LBB2_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX90A-NEXT: s_lshl_b32 s4, s4, 16
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB2_8
+; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; GFX90A-NEXT: s_lshl_b32 s6, s6, 16
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB2_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB2_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB2_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB2_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB2_2
+; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: syncscope_workgroup_nortn:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: syncscope_workgroup_nortn:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: buffer_gl0_inv
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret void
+}
+
+define float @no_unsafe(float* %addr, float %val) {
+; GFX908-LABEL: no_unsafe:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB3_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: no_unsafe:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: no_unsafe:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB3_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: no_unsafe:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_load_b32 v3, v[0:1]
+; GFX1100-NEXT: s_mov_b32 s0, 0
+; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_mov_b32_e32 v4, v3
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: buffer_gl0_inv
+; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT: s_cbranch_execnz .LBB3_1
+; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret float %res
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -atomic-expand %s | FileCheck -check-prefix=GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -atomic-expand %s | FileCheck -check-prefix=GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -atomic-expand %s | FileCheck -check-prefix=GFX1100 %s
+
+define float @syncscope_system(float* %addr, float %val) #0 {
+; GFX908-LABEL: @syncscope_system(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: @syncscope_system(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: @syncscope_system(
+; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX1100-LABEL: @syncscope_system(
+; GFX1100-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX1100-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX1100: atomicrmw.start:
+; GFX1100-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX1100-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX1100-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX1100-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX1100-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX1100-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX1100-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX1100-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX1100-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX1100-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX1100: atomicrmw.end:
+; GFX1100-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: @syncscope_system(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+ %res = atomicrmw fadd float* %addr, float %val seq_cst
+ ret float %res
+}
+
+; Returning flat fadd at workgroup scope with unsafe FP atomics (#0):
+; GFX90A takes the address-space-check expansion (shared/private/global paths
+; with native atomicrmw fadd); GFX908 and GFX11 fall back to a cmpxchg loop
+; (no returning global fadd); GFX940 and GFX1100 keep the flat atomicrmw.
+define float @syncscope_workgroup_rtn(float* %addr, float %val) #0 {
+; GFX908-LABEL: @syncscope_workgroup_rtn(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: @syncscope_workgroup_rtn(
+; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8*
+; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]]
+; GFX90A: atomicrmw.check.shared:
+; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]]
+; GFX90A: atomicrmw.shared:
+; GFX90A-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)*
+; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]]
+; GFX90A: atomicrmw.check.private:
+; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]]
+; GFX90A: atomicrmw.private:
+; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)*
+; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]]
+; GFX90A-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.global:
+; GFX90A-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)*
+; GFX90A-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.phi:
+; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ]
+; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[LOADED_PHI]]
+;
+; GFX940-LABEL: @syncscope_workgroup_rtn(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX940-NEXT: ret float [[RES]]
+;
+; GFX1100-LABEL: @syncscope_workgroup_rtn(
+; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX1100-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: @syncscope_workgroup_rtn(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret float %res
+}
+
+; No-return flat fadd at workgroup scope with unsafe FP atomics (#0):
+; both GFX908 and GFX90A take the address-space-check expansion (GFX908 has
+; the no-return global fadd); GFX940 and GFX1100 keep the flat atomicrmw;
+; GFX11 falls back to a cmpxchg loop.
+define void @syncscope_workgroup_nortn(float* %addr, float %val) #0 {
+; GFX908-LABEL: @syncscope_workgroup_nortn(
+; GFX908-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8*
+; GFX908-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]]
+; GFX908: atomicrmw.check.shared:
+; GFX908-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]])
+; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]]
+; GFX908: atomicrmw.shared:
+; GFX908-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)*
+; GFX908-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]]
+; GFX908: atomicrmw.check.private:
+; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]])
+; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]]
+; GFX908: atomicrmw.private:
+; GFX908-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)*
+; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4
+; GFX908-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]]
+; GFX908-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX908: atomicrmw.global:
+; GFX908-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)*
+; GFX908-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4
+; GFX908-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX908: atomicrmw.phi:
+; GFX908-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ]
+; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @syncscope_workgroup_nortn(
+; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float* [[ADDR:%.*]] to i8*
+; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]]
+; GFX90A: atomicrmw.check.shared:
+; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]]
+; GFX90A: atomicrmw.shared:
+; GFX90A-NEXT: [[TMP2:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(3)*
+; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]]
+; GFX90A: atomicrmw.check.private:
+; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]]
+; GFX90A: atomicrmw.private:
+; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(5)*
+; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]]
+; GFX90A-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.global:
+; GFX90A-NEXT: [[TMP5:%.*]] = addrspacecast float* [[ADDR]] to float addrspace(1)*
+; GFX90A-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VAL]] syncscope("workgroup") seq_cst, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.phi:
+; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ]
+; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @syncscope_workgroup_nortn(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VALUE:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX940-NEXT: ret void
+;
+; GFX1100-LABEL: @syncscope_workgroup_nortn(
+; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX1100-NEXT: ret void
+;
+; GFX11-LABEL: @syncscope_workgroup_nortn(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Without the "amdgpu-unsafe-fp-atomics" attribute no target is allowed the
+; hardware fadd or the address-space-check expansion: every target lowers the
+; flat fadd to a cmpxchg loop.
+define float @no_unsafe(float* %addr, float %val) {
+; GFX908-LABEL: @no_unsafe(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: @no_unsafe(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: @no_unsafe(
+; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX1100-LABEL: @no_unsafe(
+; GFX1100-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX1100-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX1100: atomicrmw.start:
+; GFX1100-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX1100-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX1100-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX1100-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX1100-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX1100-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX1100-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX1100-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX1100-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX1100-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX1100: atomicrmw.end:
+; GFX1100-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: @no_unsafe(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[ADDR:%.*]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[ADDR]] to i32*
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("workgroup") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+ %res = atomicrmw fadd float* %addr, float %val syncscope("workgroup") seq_cst
+ ret float %res
+}
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe(
-; GFX90A-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4
-; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32*
-; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float* [[PTR:%.*]] to i8*
+; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]]
+; GFX90A: atomicrmw.check.shared:
+; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]]
+; GFX90A: atomicrmw.shared:
+; GFX90A-NEXT: [[TMP2:%.*]] = addrspacecast float* [[PTR]] to float addrspace(3)*
+; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fadd float addrspace(3)* [[TMP2]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]]
+; GFX90A: atomicrmw.check.private:
+; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[TMP1]])
+; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]]
+; GFX90A: atomicrmw.private:
+; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast float* [[PTR]] to float addrspace(5)*
+; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]]
+; GFX90A-NEXT: store float [[VAL_NEW]], float addrspace(5)* [[TMP4]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.global:
+; GFX90A-NEXT: [[TMP5:%.*]] = addrspacecast float* [[PTR]] to float addrspace(1)*
+; GFX90A-NEXT: [[TMP6:%.*]] = atomicrmw fadd float addrspace(1)* [[TMP5]], float [[VALUE]] syncscope("wavefront") monotonic, align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_PHI]]
+; GFX90A: atomicrmw.phi:
+; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP3]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_GLOBAL]] ]
+; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]]
; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: ret float [[TMP6]]
+; GFX90A-NEXT: ret float [[LOADED_PHI]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe(
; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4
; GFX908-NEXT: ret half [[RES]]
;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global_align4(
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4
+; GFX90A-NEXT: ret half [[RES]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f16_global_align4(
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4
+; GFX940-NEXT: ret half [[RES]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f16_global_align4(
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst, align 4
+; GFX11-NEXT: ret half [[RES]]
+;
%res = atomicrmw fadd half addrspace(1)* %ptr, half %value seq_cst, align 4
ret half %res
}