From 21a69bdb66e3d77b066a06dd4d4c3ff8a2dd8daa Mon Sep 17 00:00:00 2001
From: Pravin Jagtap
Date: Thu, 20 Apr 2023 00:27:47 -0400
Subject: [PATCH] [NewPM][AMDGPU] Port amdgpu-atomic-optimizer
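
The optimization itself is unchanged: the legacy FunctionPass becomes a thin
wrapper, and both it and the new AMDGPUAtomicOptimizerPass delegate to a
shared AMDGPUAtomicOptimizerImpl. With the registration added to
AMDGPUTargetMachine::registerPassBuilderCallbacks, the pass can be invoked
by name under the new pass manager, as the RUN line of the new test does:

  opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer global-atomic-scan.ll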

Reviewed By: arsenm, sameerds, gandhi21299

Differential Revision: https://reviews.llvm.org/D148628
---
 llvm/lib/Target/AMDGPU/AMDGPU.h                  |    8 +
 llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp |   98 +-
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |    4 +
 llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll   | 1440 ++++++++++++++++++++++
 4 files changed, 1521 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 10a7f6c..b1a94b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -233,6 +233,14 @@ private:
   TargetMachine &TM;
 };
 
+struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
+  AMDGPUAtomicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
                                   CodeGenOpt::Level OptLevel);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ecfc7..13eaa7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -38,8 +38,23 @@ struct ReplacementInfo {
   bool ValDivergent;
 };
 
-class AMDGPUAtomicOptimizer : public FunctionPass,
-                              public InstVisitor<AMDGPUAtomicOptimizer> {
+class AMDGPUAtomicOptimizer : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<UniformityInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+  }
+};
+
+class AMDGPUAtomicOptimizerImpl
+    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
 private:
   SmallVector<ReplacementInfo, 8> ToReplace;
   const UniformityInfo *UA;
@@ -53,21 +68,19 @@ private:
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
 
 public:
-  static char ID;
-
-  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+  AMDGPUAtomicOptimizerImpl() = delete;
 
-  bool runOnFunction(Function &F) override;
+  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
+                            DominatorTree *DT, const GCNSubtarget *ST,
+                            bool IsPixelShader)
+      : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addRequired<UniformityInfoWrapperPass>();
-    AU.addRequired<TargetPassConfig>();
-  }
+  bool run(Function &F);
 
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
@@ -84,15 +97,40 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
     return false;
   }
 
-  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  DL = &F.getParent()->getDataLayout();
+  const UniformityInfo *UA =
+      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  const DataLayout *DL = &F.getParent()->getDataLayout();
+
   DominatorTreeWrapperPass *const DTW =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTW ? &DTW->getDomTree() : nullptr;
+  DominatorTree *DT = DTW ? &DTW->getDomTree() : nullptr;
+
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  ST = &TM.getSubtarget<GCNSubtarget>(F);
-  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F);
+}
+
+PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+
+  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
+  const DataLayout *DL = &F.getParent()->getDataLayout();
+
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
+}
+
+bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
 
   visit(F);
@@ -107,7 +145,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
   return Changed;
 }
 
-void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // Early exit for unhandled address space atomic instructions.
   switch (I.getPointerAddressSpace()) {
   default:
@@ -162,7 +200,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
   ToReplace.push_back(Info);
 }
 
-void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
   AtomicRMWInst::BinOp Op;
 
   switch (I.getIntrinsicID()) {
@@ -283,9 +321,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
 
 // Use the builder to create a reduction of V across the wavefront, with all
 // lanes active, returning the same result in all lanes.
-Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
-                                             AtomicRMWInst::BinOp Op, Value *V,
-                                             Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
+                                                 AtomicRMWInst::BinOp Op,
+                                                 Value *V,
+                                                 Value *const Identity) const {
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
@@ -328,8 +367,9 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
 // all lanes active.
-Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
-                                        Value *V, Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
+                                            AtomicRMWInst::BinOp Op, Value *V,
+                                            Value *const Identity) const {
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
@@ -385,8 +425,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
 
 // Use the builder to create a shift right of V across the wavefront, with all
 // lanes active, to turn an inclusive scan into an exclusive scan.
-Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
-                                              Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
+                                                  Value *const Identity) const {
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
@@ -456,10 +496,10 @@ static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
   return (CI && CI->isOne()) ?
RHS : B.CreateMul(LHS, RHS); } -void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, - AtomicRMWInst::BinOp Op, - unsigned ValIdx, - bool ValDivergent) const { +void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, + AtomicRMWInst::BinOp Op, + unsigned ValIdx, + bool ValDivergent) const { // Start building just before the instruction. IRBuilder<> B(&I); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 410cd4e..94c3cc3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -650,6 +650,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUUnifyDivergentExitNodesPass()); return true; } + if (PassName == "amdgpu-atomic-optimizer") { + PM.addPass(AMDGPUAtomicOptimizerPass(*this)); + return true; + } return false; }); diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll new file mode 100644 index 0000000..3a63629 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll @@ -0,0 +1,1440 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s + +define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_add_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_add_i32_max_neg_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], 
label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_add_i32_soffset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_add_i32_huge_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 + + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_add_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 
[[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_add_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_add_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], 
[[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_add_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_add_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst + 
store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_add_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_add_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_and_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = 
extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_and_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_and_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) 
%out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_and_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_and_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_and_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; 
IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_and_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_and_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_sub_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; 
IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_sub_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_sub_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 
[[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_sub_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_sub_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_ret(ptr 
addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_sub_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_sub_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_sub_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = 
call i64 @llvm.ctpop.i64(i64 [[TMP0]]) +; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] +; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_max_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_max_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret 
void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_max_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_max_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_max_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] 
= call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_max_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_max_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + ret void 
+} + +define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_max_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_umax_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_umax_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; 
IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_umax_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_umax_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_umax_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_umax_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_umax_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = 
extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_umax_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_min_i32_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile min ptr 
addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_min_i32_ret_offset( +; IR-NEXT: entry: +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_min_i32_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_min_i32_ret_addr64_offset( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 
[[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { +; IR-LABEL: @atomic_min_i32( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { +; IR-LABEL: @atomic_min_i32_ret( +; IR-NEXT: entry: +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 
[[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { +; IR-LABEL: @atomic_min_i32_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { +; IR-LABEL: @atomic_min_i32_ret_addr64( +; IR-NEXT: entry: +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: br label [[TMP9]] +; IR: 9: +; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] +; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: ret void +; +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index + %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + store i32 %val, ptr addrspace(1) %out2 + ret void +} -- 2.7.4