From a116602475314e0b3e7e4e28bd7057df1937b761 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 27 Nov 2018 21:24:54 +0000 Subject: [PATCH] [OPENMP][NVPTX] Basic support for reductions across teams. Added basic codegen support for reductions across teams. llvm-svn: 347715 --- clang/lib/CodeGen/CGOpenMPRuntime.h | 17 +- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 480 ++------ .../test/OpenMP/nvptx_teams_reduction_codegen.cpp | 1203 +++----------------- 3 files changed, 249 insertions(+), 1451 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 489ecdf..d332623 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -290,6 +290,16 @@ protected: /// default location. virtual unsigned getDefaultLocationReserved2Flags() const { return 0; } + /// Get the LLVM type for the critical name. + llvm::ArrayType *getKmpCriticalNameTy() const { return KmpCriticalNameTy; } + + /// Returns corresponding lock object for the specified critical region + /// name. If the lock object does not exist it is created, otherwise the + /// reference to the existing copy is returned. + /// \param CriticalName Name of the critical region. + /// + llvm::Value *getCriticalRegionLock(StringRef CriticalName); + private: /// Default const ident_t object used for initialization of all other /// ident_t objects. @@ -707,13 +717,6 @@ private: llvm::Value *Ctor, llvm::Value *CopyCtor, llvm::Value *Dtor, SourceLocation Loc); - /// Returns corresponding lock object for the specified critical region - /// name. If the lock object does not exist it is created, otherwise the - /// reference to the existing copy is returned. - /// \param CriticalName Name of the critical region. - /// - llvm::Value *getCriticalRegionLock(StringRef CriticalName); - struct TaskResultTy { llvm::Value *NewTask = nullptr; llvm::Value *TaskEntry = nullptr; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 3bd8812..def3ba7 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -62,22 +62,12 @@ enum OpenMPRTLFunctionNVPTX { /// lane_offset, int16_t shortCircuit), /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, - /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32 - /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - /// lane_offset, int16_t shortCircuit), - /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); - OMPRTL_NVPTX__kmpc_simd_reduce_nowait, - /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, - /// int32_t num_vars, size_t reduce_size, void *reduce_data, - /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t - /// lane_offset, int16_t shortCircuit), - /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), - /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad, - /// int32_t index, int32_t width), - /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t - /// index, int32_t width, int32_t reduce)) - OMPRTL_NVPTX__kmpc_teams_reduce_nowait, + /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 + /// global_tid, kmp_critical_name *lck) + OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple, + /// Call to 
__kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, + /// kmp_int32 global_tid, kmp_critical_name *lck) + OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple, /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); OMPRTL_NVPTX__kmpc_end_reduce_nowait, /// Call to void __kmpc_data_sharing_init_stack(); @@ -1703,83 +1693,37 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); break; } - case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: { - // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid, - // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t Algorithm Version), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); - llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, - CGM.Int16Ty, CGM.Int16Ty}; - auto *ShuffleReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; - auto *InterWarpCopyFnTy = - llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, - /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, - CGM.Int32Ty, - CGM.SizeTy, - CGM.VoidPtrTy, - ShuffleReduceFnTy->getPointerTo(), - InterWarpCopyFnTy->getPointerTo()}; + case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { + // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); + llvm::Type *TypeParams[] = {CGM.Int32Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); break; } - case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: { - // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, - // int32_t num_vars, size_t reduce_size, void *reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t shortCircuit), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), - // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad, - // int32_t index, int32_t width), - // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, - // int32_t index, int32_t width, int32_t reduce)) - llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, - CGM.Int16Ty, CGM.Int16Ty}; - auto *ShuffleReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; - auto *InterWarpCopyFnTy = - llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, - /*isVarArg=*/false); - llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy, - CGM.Int32Ty, CGM.Int32Ty}; - auto *CopyToScratchpadFnTy = - llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams, - /*isVarArg=*/false); - llvm::Type *LoadReduceTypeParams[] = { - CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty}; - auto *LoadReduceFnTy = - llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams, - /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, - CGM.Int32Ty, - CGM.SizeTy, - CGM.VoidPtrTy, - ShuffleReduceFnTy->getPointerTo(), - InterWarpCopyFnTy->getPointerTo(), - CopyToScratchpadFnTy->getPointerTo(), - 
LoadReduceFnTy->getPointerTo()}; + case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple: { + // Build __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 + // global_tid, kmp_critical_name *lck) + llvm::Type *TypeParams[] = { + getIdentTyPointerTy(), CGM.Int32Ty, + llvm::PointerType::getUnqual(getKmpCriticalNameTy())}; auto *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_simple"); break; } - case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { - // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); - llvm::Type *TypeParams[] = {CGM.Int32Ty}; + case OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple: { + // Build __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, kmp_int32 + // global_tid, kmp_critical_name *lck) + llvm::Type *TypeParams[] = { + getIdentTyPointerTy(), CGM.Int32Ty, + llvm::PointerType::getUnqual(getKmpCriticalNameTy())}; auto *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple"); break; } case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { @@ -3126,222 +3070,6 @@ static void emitReductionListCopy( } } -/// This function emits a helper that loads data from the scratchpad array -/// and (optionally) reduces it with the input operand. -/// -/// load_and_reduce(local, scratchpad, index, width, should_reduce) -/// reduce_data remote; -/// for elem in remote: -/// remote.elem = Scratchpad[elem_id][index] -/// if (should_reduce) -/// local = local @ remote -/// else -/// local = remote -static llvm::Value *emitReduceScratchpadFunction( - CodeGenModule &CGM, ArrayRef<const Expr *> Privates, - QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1); - - // Destination of the copy. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Base address of the scratchpad array, with each element storing a - // Reduce list per team. - ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // A source index into the scratchpad array. - ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty, - ImplicitParamDecl::Other); - // Row width of an element in the scratchpad array, typically - // the number of teams. - ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty, - ImplicitParamDecl::Other); - // If should_reduce == 1, then it's load AND reduce, - // If should_reduce == 0 (or otherwise), then it only loads (+ copy). - // The latter case is used for initialization. 
- ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - Int32Ty, ImplicitParamDecl::Other); - - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&ScratchPadArg); - Args.push_back(&IndexArg); - Args.push_back(&WidthArg); - Args.push_back(&ShouldReduceArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_load_and_reduce", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - // Get local Reduce list pointer. - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address ReduceListAddr( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg); - llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar( - AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - - Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg); - llvm::Value *IndexVal = Bld.CreateIntCast( - CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc), - CGM.SizeTy, /*isSigned=*/true); - - Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); - llvm::Value *WidthVal = Bld.CreateIntCast( - CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc), - CGM.SizeTy, /*isSigned=*/true); - - Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg); - llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar( - AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc); - - // The absolute ptr address to the base addr of the next element to copy. - llvm::Value *CumulativeElemBasePtr = - Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy); - Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign()); - - // Create a Remote Reduce list to store the elements read from the - // scratchpad array. - Address RemoteReduceList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list"); - - // Assemble remote Reduce list from scratchpad array. - emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates, - SrcDataAddr, RemoteReduceList, - {/*RemoteLaneOffset=*/nullptr, - /*ScratchpadIndex=*/IndexVal, - /*ScratchpadWidth=*/WidthVal}); - - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal); - Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); - - CGF.EmitBlock(ThenBB); - // We should reduce with the local Reduce list. - // reduce_function(LocalReduceList, RemoteReduceList) - llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ReduceListAddr.getPointer(), CGF.VoidPtrTy); - llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - RemoteReduceList.getPointer(), CGF.VoidPtrTy); - CGM.getOpenMPRuntime().emitOutlinedFunctionCall( - CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr}); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - // No reduction; just copy: - // Local Reduce list = Remote Reduce list. 
- emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, - RemoteReduceList, ReduceListAddr); - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(MergeBB); - - CGF.FinishFunction(); - return Fn; -} - -/// This function emits a helper that stores reduced data from the team -/// master to a scratchpad array in global memory. -/// -/// for elem in Reduce List: -/// scratchpad[elem_id][index] = elem -/// -static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM, - ArrayRef<const Expr *> Privates, - QualType ReductionArrayTy, - SourceLocation Loc) { - - ASTContext &C = CGM.getContext(); - QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1); - - // Source of the copy. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // Base address of the scratchpad array, with each element storing a - // Reduce list per team. - ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); - // A destination index into the scratchpad array, typically the team - // identifier. - ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty, - ImplicitParamDecl::Other); - // Row width of an element in the scratchpad array, typically - // the number of teams. - ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty, - ImplicitParamDecl::Other); - - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&ScratchPadArg); - Args.push_back(&IndexArg); - Args.push_back(&WidthArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_copy_to_scratchpad", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - Address SrcDataAddr( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); - - Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg); - llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar( - AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc); - - Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg); - llvm::Value *IndexVal = Bld.CreateIntCast( - CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc), - CGF.SizeTy, /*isSigned=*/true); - - Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); - llvm::Value *WidthVal = Bld.CreateIntCast( - CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc), - CGF.SizeTy, /*isSigned=*/true); - - // The absolute ptr address to the base addr of the next element to copy. 
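// A minimal sketch of the scratchpad layout the removed helpers assume, with names invented for illustration: element E of team T lives at Base + sizeof(E) * T, and the row holding that element for all NumTeams teams is padded to a 128-byte boundary before the next element's row begins (this is the arithmetic the old CHECK lines further below spell out):
//
//   uintptr_t Slot = Base + ElemSize * Team;                // this team's slot in the row
//   Base = llvm::alignTo(Base + ElemSize * NumTeams, 128);  // base of the next element's row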
- llvm::Value *CumulativeElemBasePtr = - Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy); - Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign()); - - emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates, - SrcDataAddr, DestDataAddr, - {/*RemoteLaneOffset=*/nullptr, - /*ScratchpadIndex=*/IndexVal, - /*ScratchpadWidth=*/WidthVal}); - - CGF.FinishFunction(); - return Fn; -} - /// This function emits a helper that gathers Reduce lists from the first /// lane of every active warp to lanes in the first warp. /// @@ -4061,65 +3789,65 @@ void CGOpenMPRuntimeNVPTX::emitReduction( assert((TeamsReduction || ParallelReduction) && "Invalid reduction selection in emitReduction."); - ASTContext &C = CGM.getContext(); - - // 1. Build a list of reduction variables. - // void *RedList[] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; - auto Size = RHSExprs.size(); - for (const Expr *E : Privates) { - if (E->getType()->isVariablyModifiedType()) - // Reserve place for array size. - ++Size; - } - llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, - /*IndexTypeQuals=*/0); - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - unsigned Idx = 0; - for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, - CGF.getPointerSize()); - CGF.Builder.CreateStore( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), - Elem); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. - ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, - CGF.getPointerSize()); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } - } - - // 2. Emit reduce_func(). - llvm::Value *ReductionFn = emitReductionFunction( - CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), - Privates, LHSExprs, RHSExprs, ReductionOps); - // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), + // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), // RedList, shuffle_reduce_func, interwarp_copy_func); + // or + // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); llvm::Value *ThreadId = getThreadID(CGF, Loc); - llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); - llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - ReductionList.getPointer(), CGF.VoidPtrTy); - - llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction( - CGM, Privates, ReductionArrayTy, ReductionFn, Loc); - llvm::Value *InterWarpCopyFn = - emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); llvm::Value *Res; if (ParallelReduction) { + ASTContext &C = CGM.getContext(); + // 1. Build a list of reduction variables. + // void *RedList[] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; + auto Size = RHSExprs.size(); + for (const Expr *E : Privates) { + if (E->getType()->isVariablyModifiedType()) + // Reserve place for array size. 
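// A minimal sketch of the reduction list this branch builds, assuming two scalar reduction variables, char c and float d, named purely for illustration:
//
//   void *RedList[2] = { &c, &d };
//
// A variably modified private contributes one extra slot holding its element count cast to a pointer, which is why Size grows here before the array type is formed.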
+ ++Size; + } + llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); + QualType ReductionArrayTy = + C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + unsigned Idx = 0; + for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + CGF.Builder.CreateStore( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + Elem); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. + ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .NumElts, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); + } + } + + llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); + llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + ReductionList.getPointer(), CGF.VoidPtrTy); + llvm::Value *ReductionFn = emitReductionFunction( + CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), + Privates, LHSExprs, RHSExprs, ReductionOps); + llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction( + CGM, Privates, ReductionArrayTy, ReductionFn, Loc); + llvm::Value *InterWarpCopyFn = + emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); + llvm::Value *Args[] = {ThreadId, CGF.Builder.getInt32(RHSExprs.size()), ReductionArrayTySize, @@ -4132,21 +3860,13 @@ void CGOpenMPRuntimeNVPTX::emitReduction( Args); } else { assert(TeamsReduction && "expected teams reduction."); - llvm::Value *ScratchPadCopyFn = - emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction( - CGM, Privates, ReductionArrayTy, ReductionFn, Loc); - - llvm::Value *Args[] = {ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn, - ScratchPadCopyFn, - LoadAndReduceFn}; + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + std::string Name = getName({"reduction"}); + llvm::Value *Lock = getCriticalRegionLock(Name); + llvm::Value *Args[] = {RTLoc, ThreadId, Lock}; Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait), + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple), Args); } @@ -4164,7 +3884,6 @@ void CGOpenMPRuntimeNVPTX::emitReduction( CGF.EmitBlock(ThenBB); // Add emission of __kmpc_end_reduce{_nowait}(); - llvm::Value *EndArgs[] = {ThreadId}; auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, this](CodeGenFunction &CGF, PrePostActionTy &Action) { auto IPriv = Privates.begin(); @@ -4178,13 +3897,30 @@ void CGOpenMPRuntimeNVPTX::emitReduction( ++IRHS; } }; - RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, llvm::None, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), - EndArgs); - RCG.setAction(Action); - RCG(CGF); + if (ParallelReduction) { + llvm::Value *EndArgs[] = {ThreadId}; + RegionCodeGenTy RCG(CodeGen); + NVPTXActionTy Action( + nullptr, llvm::None, + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), + EndArgs); + RCG.setAction(Action); + 
RCG(CGF); + } else { + assert(TeamsReduction && "expected teams reduction."); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + std::string Name = getName({"reduction"}); + llvm::Value *Lock = getCriticalRegionLock(Name); + llvm::Value *EndArgs[] = {RTLoc, ThreadId, Lock}; + RegionCodeGenTy RCG(CodeGen); + NVPTXActionTy Action( + nullptr, llvm::None, + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple), + EndArgs); + RCG.setAction(Action); + RCG(CGF); + } // There is no need to emit line number for unconditional branch. (void)ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBlock(ExitBB, /*IsFinished=*/true); diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index e2a103a..d0a31b0 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -78,11 +78,7 @@ int bar(int n){ // CHECK: [[EV:%.+]] = load double, double* [[E]], align // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 // CHECK: store double [[ADD]], double* [[E]], align - // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i[[SZ:32|64]] 0, i{{32|64}} 0 - // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* - // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align - // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i[[SZ]] {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -91,977 +87,164 @@ int bar(int n){ // CHECK: [[EV:%.+]] = load double, double* [[E]], align // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] // CHECK: store double [[ADD]], double* [[E_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]]) // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] // CHECK: call void @__kmpc_kernel_deinit( - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) - // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], - // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], - // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* - // - // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], - // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], - // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] - // CHECK: store double [[RES]], double* [[VAR_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = 
alloca [[RLT]], align - // CHECK: [[REMOTE_ELT:%.+]] = alloca double - // - // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // - // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64* - // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64* - // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) - // - // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align - // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* - // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align - // - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align - // CHECK: store double 
[[REMOTE_ELT_VAL]], double* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], - // CHECK: br label - // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], - // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 - // CHECK: br i1 [[DONE_COPY]], label - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] - // - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], - // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. 
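// A minimal sketch of the copy protocol these removed checks verified, with names assumed for illustration: lane 0 of each warp publishes one 32-bit chunk to a 32-slot shared transfer array, all active threads meet at the named barrier (llvm.nvvm.barrier), and the threads of warp 0 then read the slots back into their reduce list:
//
//   if (LaneId == 0) Medium[WarpId] = Elt;     // one writer per warp
//   barrier(ActiveWarps * WarpSize);           // all active threads sync
//   if (Tid < ActiveWarps) Elt = Medium[Tid];  // warp 0 gathers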
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], - // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 - // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], - // CHECK: br label - // CHECK: ret - - // - // Copy to scratchpad function - // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) - // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align - // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 - // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 - // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align - // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // - // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]] - // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]] - // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* - // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align - // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align - // - // CHECK: ret - - // - // Load and reduce function - // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT:%.+]] = alloca double - // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align - // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 - // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 - // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align - // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align - // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] - // - // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]] - // CHECK: 
[[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]] - // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* - - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align - // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align - // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* - // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align - // - // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0 - // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // Copy element from remote reduce list - // CHECK: [[REDUCE_ELSE]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align - // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // CHECK: ret - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker() // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]]( // - // CHECK: {{call|invoke}} void [[T2]]_worker() - // - // CHECK: call void @__kmpc_kernel_init( - // - // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align - // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align - // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 - // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 - // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[TRUNC]], i8* [[C]], align - // CHECK: [[DV:%.+]] = load float, float* [[D]], align - // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} - // CHECK: store float [[MUL]], float* [[D]], align - // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: store i8* [[C]], i8** [[PTR1]], align - // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* - // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align - // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) - // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 - // CHECK: br i1 
[[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] - // - // CHECK: [[IFLABEL]] - // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align - // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 - // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align - // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 - // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] - // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align - // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align - // CHECK: [[DV:%.+]] = load float, float* [[D]], align - // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] - // CHECK: store float [[MUL]], float* [[D_IN]], align - // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[EXIT]] - // - // CHECK: [[EXIT]] - // CHECK: call void @__kmpc_kernel_deinit( - - // - // Reduction function - // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) - // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], - // - // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], - // - // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], - // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* - // - // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], - // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* - // - // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], - // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 - // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], - // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 - // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] - // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 - // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], - // - // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], - // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], - // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] - // CHECK: store float [[RES]], float* [[VAR2_LHS]], - // CHECK: ret void - - // - // Shuffle and reduce function - // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 - // CHECK: [[REMOTE_ELT2:%.+]] = alloca float - // - // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align - // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // - // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] 
= trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) - // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 - // - // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align - // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // - // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32* - // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32* - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align - // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 - // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) - // - // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align - // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* - // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align - // - // Condition to reduce - // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 - // - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] - // - // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 - // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 - // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 - // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] - // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 - // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] - // - // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] - // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] - // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] - // - // CHECK: [[DO_REDUCE]] - // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* - // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* - // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) - // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] - // - // CHECK: [[REDUCE_ELSE]] - // CHECK: br label {{%?}}[[REDUCE_CONT]] - // - // CHECK: [[REDUCE_CONT]] - // Now check if we should just copy over the remote reduction list - // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 - // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] - // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] - // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // CHECK: [[DO_COPY]] - // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align - // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align - // - // 
CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align - // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // CHECK: [[COPY_CONT]] - // CHECK: void - - // - // Inter warp copy function - // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) - // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 - // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 - // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // - // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. 
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 - // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] - // - // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align - // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: br label {{%?}}[[COPY_CONT:.+]] - // - // CHECK: [[COPY_ELSE]] - // CHECK: br label {{%?}}[[COPY_CONT]] - // - // Barrier after copy to shared memory storage medium. - // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // - // Read into warp 0. 
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] - // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] - // - // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align - // CHECK: br label {{%?}}[[READ_CONT:.+]] - // - // CHECK: [[READ_ELSE]] - // CHECK: br label {{%?}}[[READ_CONT]] - // - // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) - // CHECK: ret - - // - // Copy to scratchpad function - // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) - // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* - // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align - // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 - // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align - // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 - // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align - // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // - // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]] - // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]] - // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* - // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align - // - // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1 - // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]] - // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1 - // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128 - // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1 - // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128 - // - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 - // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // - // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]] - // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] - // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* - // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align - // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align - // - // CHECK: ret - - // - // Load and reduce function - // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) - // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align - // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 - // CHECK: 
[[REMOTE_ELT2:%.+]] = alloca float
- // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
- // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
- // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
- // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
- // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
- // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align
- // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align
- // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1
- // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
- // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
- // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
- // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
- // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align
- // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0
- // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // Copy element from remote reduce list
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
- // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
- // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // CHECK: ret
-
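
The deleted scratchpad checks above all follow one address computation: a team's slot for a given element lives at scratchpad_base + team * elem_size, and the array for the next element starts at the end of the previous array rounded up to a 128-byte boundary. A minimal host-side model of that rounding, under the assumption that the POS1..POS4 lines encode exactly this (nextScratchpadBase is a hypothetical helper, not a runtime entry point):

    #include <cassert>
    #include <cstdint>

    // Models POS1..POS4 and SCRATCHPAD_NEXT: round Base + NumTeams * ElemSize
    // up to the next multiple of 128.
    static uint64_t nextScratchpadBase(uint64_t Base, uint64_t NumTeams,
                                       uint64_t ElemSize) {
      uint64_t End = Base + NumTeams * ElemSize; // POS1
      return ((End - 1) / 128 + 1) * 128;        // POS2, POS3, POS4 * 128
    }

    int main() {
      // 100 teams of i8 starting at offset 0: the float array that follows
      // begins at the next 128-byte boundary, i.e. byte 128.
      assert(nextScratchpadBase(0, 100, 1) == 128);
      return 0;
    }
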
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
- // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
- // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
- // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
- // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
- // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
- // CHECK: store i32 0, i32* [[A_ADDR]],
- // CHECK: store i16 -32768, i16* [[B_ADDR]],
- // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
- // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
- // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[EXIT]]
- //
- // CHECK: [[EXIT]]
- // call void @__kmpc_restore_team_static_memory(i16 1)
- // CHECK: call void @__kmpc_spmd_kernel_deinit(
-
- // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
- //
- // CHECK: store i32 0, i32* [[A:%.+]], align
- // CHECK: store i16 -32768, i16* [[B:%.+]], align
- // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
- // CHECK: store i32 [[OR]], i32* [[A]], align
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
- // CHECK: store i16 [[TRUNC]], i16* [[B]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
- // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
- // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
- // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
- //
- // CHECK: [[IFLABEL]]
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[EXIT]]
- //
- // CHECK: [[EXIT]]
- // CHECK: ret void
-
- //
- // Reduction function
- // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
- //
- // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
- // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
- // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
- // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
- //
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
- // CHECK: ret void
-
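
The reduction function whose checks were just deleted has the same shape as the one the renamed hunks below keep: it receives two reduce lists as i8*, pulls out each element, and folds the right-hand value into the left-hand one. A plain C++ model of that shape (reduceListCombine and the [2 x i8*]-as-void** view are illustrative assumptions, not runtime symbols):

    #include <cstdint>

    // Element 0 is an i32 combined with |, element 1 an i16 combined with
    // max, matching the or / sext / icmp sgt pattern in the checks.
    static void reduceListCombine(void *LHSList[2], void *RHSList[2]) {
      int32_t &Var1LHS = *static_cast<int32_t *>(LHSList[0]);
      int32_t Var1RHS = *static_cast<int32_t *>(RHSList[0]);
      Var1LHS |= Var1RHS;

      int16_t &Var2LHS = *static_cast<int16_t *>(LHSList[1]);
      int16_t Var2RHS = *static_cast<int16_t *>(RHSList[1]);
      if (Var2RHS > Var2LHS)
        Var2LHS = Var2RHS;
    }
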
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- //
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
- //
- // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
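
Each element is moved between lanes with __kmpc_shuffle_int32; 16-bit elements are widened with sext and truncated back, as the sext/trunc pair above checks. A host-side sketch of what one shuffle step computes (shuffleStep is a stand-in for the device shuffle, modeled on a 32-lane warp):

    #include <cstdint>

    // Every lane receives the value held by the lane LaneOffset above it;
    // on the device this is a single warp-shuffle instruction.
    static void shuffleStep(const int32_t (&Src)[32], int32_t (&Dst)[32],
                            unsigned LaneOffset) {
      for (unsigned Lane = 0; Lane < 32; ++Lane)
        Dst[Lane] = Src[(Lane + LaneOffset) % 32];
    }
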
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ // CHECK: {{call|invoke}} void [[T2]]_worker()
 //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
+ // CHECK: call void @__kmpc_kernel_init(
 //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
+ // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
+ // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
+ // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
+ // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
+ // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+ // CHECK: store i8 [[TRUNC]], i8* [[C]], align
+ // CHECK: [[DV:%.+]] = load float, float* [[D]], align
+ // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
+ // CHECK: store float [[MUL]], float* [[D]], align
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
 //
- // Inter warp copy function
- // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
+ // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
+ // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
+ // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
+ // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
+ // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+ // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
+ // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
+ // CHECK: [[DV:%.+]] = load float, float* [[D]], align
+ // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
+ // CHECK: store float [[MUL]], float* [[D_IN]], align
+ // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]])
+ // CHECK: br label %[[EXIT]]
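
These added checks exercise the new simplified protocol: the per-team combination is serialized through the kmp_critical_name lock (the [8 x i32] object above) instead of going through shuffles and scratchpads. A rough host-side model of the guarded region between the _simple and _end_..._simple calls (TeamsLock and combineTeamPartial are illustrative names, not runtime symbols):

    #include <mutex>

    static std::mutex TeamsLock; // stands in for the kmp_critical_name lock

    static void combineTeamPartial(signed char &COrig, float &DOrig,
                                   signed char CPart, float DPart) {
      std::lock_guard<std::mutex> Guard(TeamsLock);
      COrig ^= CPart; // xor-combine, as in the [[IFLABEL]] block
      DOrig *= DPart; // fmul-combine
    }
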
 //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[EXIT]]
+ // CHECK: call void @__kmpc_kernel_deinit(
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
 //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ // CHECK: call void @__kmpc_spmd_kernel_init(
+ // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
+ // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
+ // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
+ // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
+ // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
+ // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
+ // CHECK: store i32 0, i32* [[A_ADDR]],
+ // CHECK: store i16 -32768, i16* [[B_ADDR]],
+ // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
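
Judging by the two GEP indices and the initial stores, the globalized record handed out by __kmpc_get_team_static_memory plausibly has this layout (an assumption; the struct name is invented):

    // Field 0 is seeded with 0, the identity of |; field 1 with -32768,
    // the identity of max over 16-bit signed values.
    struct TeamGlobalRec {
      int a;
      short b;
    };
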
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
 //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+ // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+ // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+ // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+ // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
 //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
 //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
 //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+ // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]])
+ // CHECK: br label %[[EXIT]]
 //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
+ // CHECK: [[EXIT]]
+ // call void @__kmpc_restore_team_static_memory(i16 1)
+ // CHECK: call void @__kmpc_spmd_kernel_deinit(
+
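
A sketch of the source pattern the l50 kernel checks appear to correspond to (the test source itself is outside this hunk, so treat the pragmas and body as an approximation): a teams reduction wrapping a parallel reduction over the same two variables.

    int a;
    short b;

    void teamsReduceSketch() {
    #pragma omp target teams reduction(|: a) reduction(max: b)
    #pragma omp parallel reduction(|: a) reduction(max: b)
      {
        a |= 1;
        b = 99 > b ? 99 : b;
      }
    }
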
+ // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
 //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ // CHECK: store i32 0, i32* [[A:%.+]], align
+ // CHECK: store i16 -32768, i16* [[B:%.+]], align
+ // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
+ // CHECK: store i32 [[OR]], i32* [[A]], align
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
 //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[DO_MAX]]
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
 //
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
+ // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
+ // CHECK: br label {{%?}}[[MAX_CONT]]
 //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
+ // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
+ // CHECK: store i16 [[TRUNC]], i16* [[B]], align
+ // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{.+}} 0, i[[SZ:.+]] 0
+ // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
+ // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+ // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
+ // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+ // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
 //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+ // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+ // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+ // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+ // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
 //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
 //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
 //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+ // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+ // CHECK: br label %[[EXIT]]
 //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
- // CHECK: ret
+ // CHECK: [[EXIT]]
+ // CHECK: ret void
 //
 // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
+ // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
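
For reference, the two callbacks threaded through __kmpc_nvptx_parallel_reduce_nowait above, written as plain typedefs that mirror the function-pointer types in the call (parameter names are descriptive assumptions):

    #include <cstdint>

    using ShuffleReduceFn = void (*)(void *ReduceData, int16_t LaneId,
                                     int16_t LaneOffset, int16_t AlgoVersion);
    using InterWarpCopyFn = void (*)(void *ReduceData, int32_t WarpNum);
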
@@ -1106,7 +289,7 @@ int bar(int n){
 //
 // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+ // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
@@ -1166,7 +349,7 @@ int bar(int n){
 // CHECK: [[DO_REDUCE]]
 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+ // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
 //
 // CHECK: [[REDUCE_ELSE]]
@@ -1207,7 +390,7 @@ int bar(int n){
 //
 // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
+ // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
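
The renamed inter-warp copy function funnels one value per warp through a 32-slot shared-memory transfer array and lets warp 0 read it back, with barriers in between. A barrier-free host-side model of the data movement only (interWarpCopy and the plain arrays are illustrative; on the device the transfer array is the addrspace(3) [[TRANSFER_STORAGE]] buffer):

    #include <cstdint>

    static void interWarpCopy(const int32_t *WarpReduced, unsigned NumWarps,
                              int32_t *Warp0Lanes) {
      int32_t Transfer[32] = {};
      for (unsigned W = 0; W < NumWarps; ++W)
        Transfer[W] = WarpReduced[W];      // lane 0 of each warp writes a slot
      for (unsigned Lane = 0; Lane < NumWarps; ++Lane)
        Warp0Lanes[Lane] = Transfer[Lane]; // warp 0 reads one slot per lane
    }
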
@@ -1295,128 +478,4 @@ int bar(int n){
 // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
 // CHECK: ret
- //
- // Copy to scratchpad function
- // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32)
- // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
- // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
- // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
- // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
- // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
- //
- // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
- // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
- // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
- // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
- // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
- // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align
- //
- // CHECK: ret
-
- //
- // Load and reduce function
- // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32)
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align
- // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64
- // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align
- // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64
- // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align
- // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
- // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
- // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
- // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
- // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128
- // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
- // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128
- //
- // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
- // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
- // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[REDUCE:%.+]] = icmp ne i32 [[SHOULD_REDUCE]], 0
- // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // Copy element from remote reduce list
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // CHECK: ret
-
-
 #endif
-- 
2.7.4