From: Carlo Bertolli
Date: Tue, 7 Dec 2021 21:00:11 +0000 (+0000)
Subject: [NFC][OpenMP] Prepare amdgpu plugin for asynchronous implementation of target region...
X-Git-Tag: upstream/15.0.7~23723
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d9b1d827d2e9ae135901b6eccf25a05ef49f38af;p=platform%2Fupstream%2Fllvm.git

[NFC][OpenMP] Prepare amdgpu plugin for asynchronous implementation of target region launch

At present, the amdgpu plugin merges both the asynchronous and synchronous kernel launch implementations into a single synchronous version. This patch prepares the plugin for an asynchronous implementation by:

- Privatizing the actual kernel launch code (valid in both cases) into a base function in an anonymous namespace

The actual separation of the kernel launch code (async vs. sync) will follow in a subsequent patch.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115267
---
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 45d9476..5434692 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -883,113 +883,473 @@ void finiAsyncInfo(__tgt_async_info *AsyncInfo) { AsyncInfo->Queue = 0; } -bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { - const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h - int32_t r = elf_check_machine(image, amdgcnMachineID); - if (!r) { - DP("Supported machine ID not found\n"); - } - return r; -} +// Determine launch values for kernel. +struct launchVals { + int WorkgroupSize; + int GridSize; +}; +launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, + int ConstWGSize, + llvm::omp::OMPTgtExecModeFlags ExecutionMode, + int num_teams, int thread_limit, + uint64_t loop_tripcount, int DeviceNumTeams) { -uint32_t elf_e_flags(__tgt_device_image *image) { - char *img_begin = (char *)image->ImageStart; - size_t img_size = (char *)image->ImageEnd - img_begin; + int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; + int num_groups = 0; - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; + int Max_Teams = + Env.MaxTeamsDefault > 0 ?
Env.MaxTeamsDefault : DeviceNumTeams; + if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit) + Max_Teams = RTLDeviceInfoTy::HardTeamLimit; + + if (print_kernel_trace & STARTUP_DETAILS) { + DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); + DP("Max_Teams: %d\n", Max_Teams); + DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); + DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + DP("thread_limit: %d\n", thread_limit); + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("ConstWGSize: %d\n", ConstWGSize); + } + // check for thread_limit() clause + if (thread_limit > 0) { + threadsPerGroup = thread_limit; + DP("Setting threads per block to requested %d\n", thread_limit); + // Add master warp for GENERIC + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + threadsPerGroup += WarpSize; + DP("Adding master wavefront: +%d threads\n", WarpSize); + } + if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max + threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; + DP("Setting threads per block to maximum %d\n", threadsPerGroup); + } + } + // check flat_max_work_group_size attr here + if (threadsPerGroup > ConstWGSize) { + threadsPerGroup = ConstWGSize; + DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", + threadsPerGroup); } + if (print_kernel_trace & STARTUP_DETAILS) + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("Preparing %d threads\n", threadsPerGroup); - Elf64_Ehdr *eh64 = elf64_getehdr(e); + // Set default num_groups (teams) + if (Env.TeamLimit > 0) + num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit; + else + num_groups = Max_Teams; + DP("Set default num of groups %d\n", num_groups); - if (!eh64) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("num_teams: %d\n", num_teams); } - uint32_t Flags = eh64->e_flags; - - elf_end(e); - DP("ELF Flags: 0x%x\n", Flags); - return Flags; -} -} // namespace + // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size + // This reduction is typical for default case (no thread_limit clause). + // or when user goes crazy with num_teams clause. + // FIXME: We cant distinguish between a constant or variable thread limit. + // So we only handle constant thread_limits. + if (threadsPerGroup > + RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 + // Should we round threadsPerGroup up to nearest WarpSize + // here? + num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_machine_id_is_amdgcn(image); -} + // check for num_teams() clause + if (num_teams > 0) { + num_groups = (num_teams < num_groups) ? num_teams : num_groups; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("Env.NumTeams %d\n", Env.NumTeams); + DP("Env.TeamLimit %d\n", Env.TeamLimit); + } -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo.ConstructionSucceeded) { - return DeviceInfo.NumberOfDevices; + if (Env.NumTeams > 0) { + num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; + DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); + } else if (Env.TeamLimit > 0) { + num_groups = (Env.TeamLimit < num_groups) ? 
Env.TeamLimit : num_groups; + DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); } else { - DP("AMDGPU plugin construction failed. Zero devices available\n"); - return 0; + if (num_teams <= 0) { + if (loop_tripcount > 0) { + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { + // round up to the nearest integer + num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; + } else if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + num_groups = loop_tripcount; + } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { + // This is a generic kernel that was transformed to use SPMD-mode + // execution but uses Generic-mode semantics for scheduling. + num_groups = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", + num_groups, loop_tripcount, threadsPerGroup); + } + } else { + num_groups = num_teams; + } + if (num_groups > Max_Teams) { + num_groups = Max_Teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); + } + if (num_groups > num_teams && num_teams > 0) { + num_groups = num_teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to clause num_teams %d \n", num_groups, + num_teams); + } } -} -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo.RequiresFlags = RequiresFlags; - return RequiresFlags; + // num_teams clause always honored, no matter what, unless DEFAULT is active. + if (num_teams > 0) { + num_groups = num_teams; + // Cap num_groups to EnvMaxTeamsDefault if set. + if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault) + num_groups = Env.MaxTeamsDefault; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("num_groups: %d\n", num_groups); + DP("loop_tripcount: %ld\n", loop_tripcount); + } + DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, + threadsPerGroup); + + launchVals res; + res.WorkgroupSize = threadsPerGroup; + res.GridSize = threadsPerGroup * num_groups; + return res; } -namespace { -template bool enforce_upper_bound(T *value, T upper) { - bool changed = *value > upper; - if (changed) { - *value = upper; +static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + bool full = true; + while (full) { + full = + packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue)); } - return changed; + return packet_id; } -} // namespace -int32_t __tgt_rtl_init_device(int device_id) { - hsa_status_t err; +int32_t __tgt_rtl_run_target_team_region_locked( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + // Set the context we are using + // update thread limit content in gpu memory if un-initialized or specified + // from host - // this is per device id init - DP("Initialize the device id: %d\n", device_id); + DP("Run target team region thread_limit %d\n", thread_limit); - hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + // All args are references. 
+ std::vector args(arg_num); + std::vector ptrs(arg_num); - // Get number of Compute Unit - uint32_t compute_units = 0; - err = hsa_agent_get_info( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &compute_units); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.ComputeUnits[device_id] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo.ComputeUnits[device_id] = compute_units; - DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + DP("Arg_num: %d\n", arg_num); + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); } - char GetInfoName[64]; // 64 max size returned by get info - err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (err) - DeviceInfo.GPUName[device_id] = "--unknown gpu--"; - else { - DeviceInfo.GPUName[device_id] = GetInfoName; + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + + std::string kernel_name = std::string(KernelInfo->Name); + auto &KernelInfoTable = DeviceInfo.KernelInfoTable; + if (KernelInfoTable[device_id].find(kernel_name) == + KernelInfoTable[device_id].end()) { + DP("Kernel %s not found\n", kernel_name.c_str()); + return OFFLOAD_FAIL; } - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", device_id, - DeviceInfo.ComputeUnits[device_id], - DeviceInfo.GPUName[device_id].c_str()); + const atl_kernel_info_t KernelInfoEntry = + KernelInfoTable[device_id][kernel_name]; + const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; + const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; + const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; + const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; + const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; - // Query attributes to determine number of threads/block and blocks/grid. - uint16_t workgroup_max_dim[3]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &workgroup_max_dim); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", + assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); + + /* + * Set limit based on ThreadsPerGroup and GroupsPerDevice + */ + launchVals LV = + getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, + KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, + num_teams, // From run_region arg + thread_limit, // From run_region arg + loop_tripcount, // From run_region arg + DeviceInfo.NumTeams[KernelInfo->device_id]); + const int GridSize = LV.GridSize; + const int WorkgroupSize = LV.WorkgroupSize; + + if (print_kernel_trace >= LAUNCH) { + int num_groups = GridSize / WorkgroupSize; + // enum modes are SPMD, GENERIC, NONE 0,1,2 + // if doing rtl timing, print to stderr, unless stdout requested. + bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); + fprintf(traceToStdout ? 
stdout : stderr, + "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " + "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", + device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, + arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, + group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, + vgpr_spill_count, loop_tripcount, KernelInfo->Name); + } + + // Run on the device. + { + hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); + if (!queue) { + return OFFLOAD_FAIL; + } + uint64_t packet_id = acquire_available_packet_id(queue); + + const uint32_t mask = queue->size - 1; // size is a power of 2 + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + + (packet_id & mask); + + // packet->header is written last + packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = WorkgroupSize; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->reserved0 = 0; + packet->grid_size_x = GridSize; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = KernelInfoEntry.private_segment_size; + packet->group_segment_size = KernelInfoEntry.group_segment_size; + packet->kernel_object = KernelInfoEntry.kernel_object; + packet->kernarg_address = 0; // use the block allocator + packet->reserved2 = 0; // impl writes id_ here + packet->completion_signal = {0}; // may want a pool of signals + + KernelArgPool *ArgPool = nullptr; + void *kernarg = nullptr; + { + auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (it != KernelArgPoolMap.end()) { + ArgPool = (it->second).get(); + } + } + if (!ArgPool) { + DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, + device_id); + } + { + if (ArgPool) { + assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); + kernarg = ArgPool->allocate(arg_num); + } + if (!kernarg) { + DP("Allocate kernarg failed\n"); + return OFFLOAD_FAIL; + } + + // Copy explicit arguments + for (int i = 0; i < arg_num; i++) { + memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); + } + + // Initialize implicit arguments. TODO: Which of these can be dropped + impl_implicit_args_t *impl_args = + reinterpret_cast( + static_cast(kernarg) + ArgPool->kernarg_segment_size); + memset(impl_args, 0, + sizeof(impl_implicit_args_t)); // may not be necessary + impl_args->offset_x = 0; + impl_args->offset_y = 0; + impl_args->offset_z = 0; + + // assign a hostcall buffer for the selected Q + if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { + // hostrpc_assign_buffer is not thread safe, and this function is + // under a multiple reader lock, not a writer lock. + static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&hostcall_init_lock); + unsigned long buffer = hostrpc_assign_buffer( + DeviceInfo.HSAAgents[device_id], queue, device_id); + pthread_mutex_unlock(&hostcall_init_lock); + if (!buffer) { + DP("hostrpc_assign_buffer failed, gpu would dereference null and " + "error\n"); + return OFFLOAD_FAIL; + } + + if (KernelInfoEntry.implicit_argument_count >= 4) { + // Initialise pointer for implicit_argument_count != 0 ABI + // Guess that the right implicit argument is at offset 24 after + // the explicit arguments. In the future, should be able to read + // the offset from msgpack. Clang is not annotating it at present. 
+ uint64_t Offset = + sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); + if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { + DP("Bad offset of hostcall, exceeds kernarg segment size\n"); + } else { + memcpy(static_cast(kernarg) + Offset, &buffer, 8); + } + } + + // initialise pointer for implicit_argument_count == 0 ABI + impl_args->hostcall_ptr = buffer; + } + + packet->kernarg_address = kernarg; + } + + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; + } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); + + // Publish the packet indicating it is ready to be processed + core::packet_store_release(reinterpret_cast(packet), + core::create_header(), packet->setup); + + // Since the packet is already published, its contents must not be + // accessed any more + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); + + while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + ; + + assert(ArgPool); + ArgPool->deallocate(kernarg); + DeviceInfo.FreeSignalPool.push(s); + } + + DP("Kernel completed\n"); + return OFFLOAD_SUCCESS; +} + +bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { + const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h + int32_t r = elf_check_machine(image, amdgcnMachineID); + if (!r) { + DP("Supported machine ID not found\n"); + } + return r; +} + +uint32_t elf_e_flags(__tgt_device_image *image) { + char *img_begin = (char *)image->ImageStart; + size_t img_size = (char *)image->ImageEnd - img_begin; + + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + Elf64_Ehdr *eh64 = elf64_getehdr(e); + + if (!eh64) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint32_t Flags = eh64->e_flags; + + elf_end(e); + DP("ELF Flags: 0x%x\n", Flags); + return Flags; +} +} // namespace + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_machine_id_is_amdgcn(image); +} + +int __tgt_rtl_number_of_devices() { + // If the construction failed, no methods are safe to call + if (DeviceInfo.ConstructionSucceeded) { + return DeviceInfo.NumberOfDevices; + } else { + DP("AMDGPU plugin construction failed. 
Zero devices available\n"); + return 0; + } +} + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceInfo.RequiresFlags = RequiresFlags; + return RequiresFlags; +} + +namespace { +template bool enforce_upper_bound(T *value, T upper) { + bool changed = *value > upper; + if (changed) { + *value = upper; + } + return changed; +} +} // namespace + +int32_t __tgt_rtl_init_device(int device_id) { + hsa_status_t err; + + // this is per device id init + DP("Initialize the device id: %d\n", device_id); + + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + + // Get number of Compute Unit + uint32_t compute_units = 0; + err = hsa_agent_get_info( + agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &compute_units); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.ComputeUnits[device_id] = 1; + DP("Error getting compute units : settiing to 1\n"); + } else { + DeviceInfo.ComputeUnits[device_id] = compute_units; + DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + } + + char GetInfoName[64]; // 64 max size returned by get info + err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, + (void *)GetInfoName); + if (err) + DeviceInfo.GPUName[device_id] = "--unknown gpu--"; + else { + DeviceInfo.GPUName[device_id] = GetInfoName; + } + + if (print_kernel_trace & STARTUP_DETAILS) + DP("Device#%-2d CU's: %2d %s\n", device_id, + DeviceInfo.ComputeUnits[device_id], + DeviceInfo.GPUName[device_id].c_str()); + + // Query attributes to determine number of threads/block and blocks/grid. + uint16_t workgroup_max_dim[3]; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, + &workgroup_max_dim); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Error getting grid dims: num groups : %d\n", RTLDeviceInfoTy::DefaultNumTeams); } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; @@ -1469,769 +1829,404 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, const char *DeviceName = DeviceInfo.GPUName[device_id].c_str(); const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image)); - if (strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(err)); - } - - return NULL; - } - - err = env.after_loading(); - if (err != HSA_STATUS_SUCCESS) { - return NULL; - } - } - - DP("AMDGPU module successfully loaded!\n"); - - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *state_ptr; - uint32_t state_ptr_size; - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, - &state_ptr_size); - - if (err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (state_ptr_size < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, - sizeof(void *)); - return NULL; - } - - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. 
Only try to set up a pointer for - // sizeof(void*) - if (state_ptr_size == sizeof(void *)) { - uint64_t device_State_bytes = - get_device_State_bytes((char *)image->ImageStart, img_size); - if (device_State_bytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } - - auto &dss = DeviceInfo.deviceStateStore[device_id]; - if (dss.first.get() == nullptr) { - assert(dss.second == 0); - void *ptr = NULL; - hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - dss = { - std::unique_ptr{ptr}, - device_State_bytes, - }; - } - - void *ptr = dss.first.get(); - if (device_State_bytes != dss.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } - - // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } - } - - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; - - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { - - if (!e->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); - return NULL; - } - - if (e->size) { - __tgt_offload_entry entry = *e; - - void *varptr; - uint32_t varsize; - - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, e->name, &varptr, &varsize); - - if (err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", e->name); - return NULL; - } - - if (varsize != e->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, - varsize, e->size); - return NULL; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); - entry.addr = (void *)varptr; - - DeviceInfo.addOffloadEntry(device_id, entry); - - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - e->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. 
- err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); - - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t kernarg_segment_size = 0; - auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; - hsa_status_t err = HSA_STATUS_SUCCESS; - if (!e->name) { - err = HSA_STATUS_ERROR; - } else { - std::string kernelStr = std::string(e->name); - auto It = KernelInfoMap.find(kernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t info = It->second; - kernarg_segment_size = info.kernel_segment_size; - } else { - err = HSA_STATUS_ERROR; - } - } - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WG_Size; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(e->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); - - void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - KernDescName, &KernDescPtr, &KernDescSize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); - - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - - // Check structure size against recorded size. 
- if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); - - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); - - if (KernDescVal.WG_Size == 0) { - KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); - } - WGSizeVal = KernDescVal.WG_Size; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(e->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - void *WGSizePtr; - uint32_t WGSize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - WGSizeName, &WGSizePtr, &WGSize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; - } - - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || - WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; - } + if (strcmp(DeviceName, ElfName) != 0) { + DP("Possible gpu arch mismatch: device:%s, image:%s please check" + " compiler flag: -march=\n", + DeviceName, ElfName); } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); + DP("Error loading image onto GPU: %s\n", get_error_string(err)); } - check("Loading WGSize computation property", err); + return NULL; } - // Read execution mode from global in binary - std::string ExecModeNameStr(e->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - void *ExecModePtr; - uint32_t varsize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - ExecModeName, &ExecModePtr, &varsize); + err = env.after_loading(); + if (err != HSA_STATUS_SUCCESS) { + return NULL; + } + } - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } + DP("AMDGPU module successfully loaded!\n"); - memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); + { + // the device_State array is either large value in bss or a void* that + // needs to be assigned to a pointer to an array of size device_state_bytes + // If absent, it has been deadstripped and needs no setup. 
- DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); + void *state_ptr; + uint32_t state_ptr_size; + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, + &state_ptr_size); - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); + if (err != HSA_STATUS_SUCCESS) { + DP("No device_state symbol found, skipping initialization\n"); + } else { + if (state_ptr_size < sizeof(void *)) { + DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, + sizeof(void *)); return NULL; } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); - } - check("Loading computation property", err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, kernarg_segment_size, - DeviceInfo.KernArgPool)); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); - DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); - } - - return DeviceInfo.getOffloadEntriesTable(device_id); -} - -void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { - void *ptr = NULL; - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - - if (kind != TARGET_ALLOC_DEFAULT) { - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, - (long long unsigned)(Elf64_Addr)ptr); - ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL; - return ptr; -} + // if it's larger than a void*, assume it's a bss array and no further + // initialization is required. 
Only try to set up a pointer for + // sizeof(void*) + if (state_ptr_size == sizeof(void *)) { + uint64_t device_State_bytes = + get_device_State_bytes((char *)image->ImageStart, img_size); + if (device_State_bytes == 0) { + DP("Can't initialize device_State, missing size information\n"); + return NULL; + } -int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; + auto &dss = DeviceInfo.deviceStateStore[device_id]; + if (dss.first.get() == nullptr) { + assert(dss.second == 0); + void *ptr = NULL; + hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); + if (err != HSA_STATUS_SUCCESS) { + DP("Failed to allocate device_state array\n"); + return NULL; + } + dss = { + std::unique_ptr{ptr}, + device_State_bytes, + }; + } - return __tgt_rtl_synchronize(device_id, &AsyncInfo); -} + void *ptr = dss.first.get(); + if (device_State_bytes != dss.second) { + DP("Inconsistent sizes of device_State unsupported\n"); + return NULL; + } -int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *AsyncInfo) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); - } else { - return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); + // write ptr to device memory so it can be used by later kernels + err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, + sizeof(void *), device_id); + if (err != HSA_STATUS_SUCCESS) { + DP("memcpy install of state_ptr failed\n"); + return NULL; + } + } + } } -} -int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; + // Here, we take advantage of the data that is appended after img_end to get + // the symbols' name we need to load. This data consist of the host entries + // begin and end as well as the target name (see the offloading linker script + // creation in clang compiler). - return __tgt_rtl_synchronize(device_id, &AsyncInfo); -} + // Find the symbols in the module by name. 
The name can be obtain by + // concatenating the host entry name with the target name -int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); -} + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; -int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - hsa_status_t err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); - err = core::Runtime::Memfree(tgt_ptr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} + for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { -// Determine launch values for kernel. -struct launchVals { - int WorkgroupSize; - int GridSize; -}; -launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int num_teams, int thread_limit, - uint64_t loop_tripcount, int DeviceNumTeams) { + if (!e->addr) { + // The host should have always something in the address to + // uniquely identify the target region. + DP("Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); + return NULL; + } - int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; - int num_groups = 0; + if (e->size) { + __tgt_offload_entry entry = *e; - int Max_Teams = - Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; - if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit) - Max_Teams = RTLDeviceInfoTy::HardTeamLimit; + void *varptr; + uint32_t varsize; - if (print_kernel_trace & STARTUP_DETAILS) { - DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); - DP("Max_Teams: %d\n", Max_Teams); - DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); - DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size); - DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", - RTLDeviceInfoTy::Default_WG_Size); - DP("thread_limit: %d\n", thread_limit); - DP("threadsPerGroup: %d\n", threadsPerGroup); - DP("ConstWGSize: %d\n", ConstWGSize); - } - // check for thread_limit() clause - if (thread_limit > 0) { - threadsPerGroup = thread_limit; - DP("Setting threads per block to requested %d\n", thread_limit); - // Add master warp for GENERIC - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - threadsPerGroup += WarpSize; - DP("Adding master wavefront: +%d threads\n", WarpSize); - } - if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max - threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; - DP("Setting threads per block to maximum %d\n", threadsPerGroup); - } - } - // check flat_max_work_group_size attr here - if (threadsPerGroup > ConstWGSize) { - threadsPerGroup = ConstWGSize; - DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", - threadsPerGroup); - } - if (print_kernel_trace & STARTUP_DETAILS) - DP("threadsPerGroup: %d\n", threadsPerGroup); - DP("Preparing %d threads\n", threadsPerGroup); + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfoMap, device_id, e->name, &varptr, &varsize); - // Set default num_groups (teams) - if 
(Env.TeamLimit > 0) - num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit; - else - num_groups = Max_Teams; - DP("Set default num of groups %d\n", num_groups); + if (err != HSA_STATUS_SUCCESS) { + // Inform the user what symbol prevented offloading + DP("Loading global '%s' (Failed)\n", e->name); + return NULL; + } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", num_groups); - DP("num_teams: %d\n", num_teams); - } + if (varsize != e->size) { + DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, + varsize, e->size); + return NULL; + } - // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size - // This reduction is typical for default case (no thread_limit clause). - // or when user goes crazy with num_teams clause. - // FIXME: We cant distinguish between a constant or variable thread limit. - // So we only handle constant thread_limits. - if (threadsPerGroup > - RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 - // Should we round threadsPerGroup up to nearest WarpSize - // here? - num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); + entry.addr = (void *)varptr; - // check for num_teams() clause - if (num_teams > 0) { - num_groups = (num_teams < num_groups) ? num_teams : num_groups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", num_groups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } + DeviceInfo.addOffloadEntry(device_id, entry); - if (Env.NumTeams > 0) { - num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (num_teams <= 0) { - if (loop_tripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - num_groups = loop_tripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - num_groups = loop_tripcount; - } - DP("Using %d teams due to loop trip count %" PRIu64 " and number of " - "threads per block %d\n", - num_groups, loop_tripcount, threadsPerGroup); + if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + e->flags & OMP_DECLARE_TARGET_LINK) { + // If unified memory is present any target link variables + // can access host addresses directly. There is no longer a + // need for device copies. 
+ err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, + sizeof(void *), device_id); + if (err != HSA_STATUS_SUCCESS) + DP("Error when copying USM\n"); + DP("Copy linked variable host address (" DPxMOD ")" + "to device address (" DPxMOD ")\n", + DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); } - } else { - num_groups = num_teams; - } - if (num_groups > Max_Teams) { - num_groups = Max_Teams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); + + continue; } - if (num_groups > num_teams && num_teams > 0) { - num_groups = num_teams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to clause num_teams %d \n", num_groups, - num_teams); + + DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); + + // errors in kernarg_segment_size previously treated as = 0 (or as undef) + uint32_t kernarg_segment_size = 0; + auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; + hsa_status_t err = HSA_STATUS_SUCCESS; + if (!e->name) { + err = HSA_STATUS_ERROR; + } else { + std::string kernelStr = std::string(e->name); + auto It = KernelInfoMap.find(kernelStr); + if (It != KernelInfoMap.end()) { + atl_kernel_info_t info = It->second; + kernarg_segment_size = info.kernel_segment_size; + } else { + err = HSA_STATUS_ERROR; + } } - } - // num_teams clause always honored, no matter what, unless DEFAULT is active. - if (num_teams > 0) { - num_groups = num_teams; - // Cap num_groups to EnvMaxTeamsDefault if set. - if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault) - num_groups = Env.MaxTeamsDefault; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("threadsPerGroup: %d\n", threadsPerGroup); - DP("num_groups: %d\n", num_groups); - DP("loop_tripcount: %ld\n", loop_tripcount); - } - DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, - threadsPerGroup); + // default value GENERIC (in case symbol is missing from cubin file) + llvm::omp::OMPTgtExecModeFlags ExecModeVal = + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - launchVals res; - res.WorkgroupSize = threadsPerGroup; - res.GridSize = threadsPerGroup * num_groups; - return res; -} + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; -static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { - uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); - bool full = true; - while (full) { - full = - packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue)); - } - return packet_id; -} + // get Kernel Descriptor if present. 
+ // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + }; + struct KernDescValType KernDescVal; + std::string KernDescNameStr(e->name); + KernDescNameStr += "_kern_desc"; + const char *KernDescName = KernDescNameStr.c_str(); -static int32_t __tgt_rtl_run_target_team_region_locked( - int32_t device_id, void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, - int32_t thread_limit, uint64_t loop_tripcount); + void *KernDescPtr; + uint32_t KernDescSize; + void *CallStackAddr = nullptr; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + KernDescName, &KernDescPtr, &KernDescSize); -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t num_teams, - int32_t thread_limit, - uint64_t loop_tripcount) { + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)KernDescSize != sizeof(KernDescVal)) + DP("Loading global computation properties '%s' - size mismatch (%u != " + "%lu)\n", + KernDescName, KernDescSize, sizeof(KernDescVal)); - DeviceInfo.load_run_lock.lock_shared(); - int32_t res = __tgt_rtl_run_target_team_region_locked( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams, - thread_limit, loop_tripcount); + memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - DeviceInfo.load_run_lock.unlock_shared(); - return res; -} + // Check structure size against recorded size. + if ((size_t)KernDescSize != KernDescVal.TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescVal), KernDescVal.TSize, KernDescName); -int32_t __tgt_rtl_run_target_team_region_locked( - int32_t device_id, void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, - int32_t thread_limit, uint64_t loop_tripcount) { - // Set the context we are using - // update thread limit content in gpu memory if un-initialized or specified - // from host + DP("After loading global for %s KernDesc \n", KernDescName); + DP("KernDesc: Version: %d\n", KernDescVal.Version); + DP("KernDesc: TSize: %d\n", KernDescVal.TSize); + DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); - DP("Run target team region thread_limit %d\n", thread_limit); + if (KernDescVal.WG_Size == 0) { + KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; + DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + } + WGSizeVal = KernDescVal.WG_Size; + DP("WGSizeVal %d\n", WGSizeVal); + check("Loading KernDesc computation property", err); + } else { + DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - // All args are references. 
- std::vector args(arg_num); - std::vector ptrs(arg_num); + // Flat group size + std::string WGSizeNameStr(e->name); + WGSizeNameStr += "_wg_size"; + const char *WGSizeName = WGSizeNameStr.c_str(); - DP("Arg_num: %d\n", arg_num); - for (int32_t i = 0; i < arg_num; ++i) { - ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); - args[i] = &ptrs[i]; - DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); - } + void *WGSizePtr; + uint32_t WGSize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + WGSizeName, &WGSizePtr, &WGSize); - KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)WGSize != sizeof(int16_t)) { + DP("Loading global computation properties '%s' - size mismatch (%u " + "!= " + "%lu)\n", + WGSizeName, WGSize, sizeof(int16_t)); + return NULL; + } - std::string kernel_name = std::string(KernelInfo->Name); - auto &KernelInfoTable = DeviceInfo.KernelInfoTable; - if (KernelInfoTable[device_id].find(kernel_name) == - KernelInfoTable[device_id].end()) { - DP("Kernel %s not found\n", kernel_name.c_str()); - return OFFLOAD_FAIL; - } + memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[device_id][kernel_name]; - const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; - const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; - const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; - const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; - const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; + DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); + + if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || + WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { + DP("Error wrong WGSize value specified in HSA code object file: " + "%d\n", + WGSizeVal); + WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + } + } else { + DP("Warning: Loading WGSize '%s' - symbol not found, " + "using default value %d\n", + WGSizeName, WGSizeVal); + } - assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); + check("Loading WGSize computation property", err); + } - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - launchVals LV = - getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - num_teams, // From run_region arg - thread_limit, // From run_region arg - loop_tripcount, // From run_region arg - DeviceInfo.NumTeams[KernelInfo->device_id]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; + // Read execution mode from global in binary + std::string ExecModeNameStr(e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); - if (print_kernel_trace >= LAUNCH) { - int num_groups = GridSize / WorkgroupSize; - // enum modes are SPMD, GENERIC, NONE 0,1,2 - // if doing rtl timing, print to stderr, unless stdout requested. - bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(traceToStdout ? 
stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, - group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, - vgpr_spill_count, loop_tripcount, KernelInfo->Name); - } + void *ExecModePtr; + uint32_t varsize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + ExecModeName, &ExecModePtr, &varsize); - // Run on the device. - { - hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); - if (!queue) { - return OFFLOAD_FAIL; - } - uint64_t packet_id = acquire_available_packet_id(queue); + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { + DP("Loading global computation properties '%s' - size mismatch(%u != " + "%lu)\n", + ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); + return NULL; + } - const uint32_t mask = queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *packet = - (hsa_kernel_dispatch_packet_t *)queue->base_address + - (packet_id & mask); + memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - // packet->header is written last - packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = WorkgroupSize; - packet->workgroup_size_y = 1; - packet->workgroup_size_z = 1; - packet->reserved0 = 0; - packet->grid_size_x = GridSize; - packet->grid_size_y = 1; - packet->grid_size_z = 1; - packet->private_segment_size = KernelInfoEntry.private_segment_size; - packet->group_segment_size = KernelInfoEntry.group_segment_size; - packet->kernel_object = KernelInfoEntry.kernel_object; - packet->kernarg_address = 0; // use the block allocator - packet->reserved2 = 0; // impl writes id_ here - packet->completion_signal = {0}; // may want a pool of signals + DP("After loading global for %s ExecMode = %d\n", ExecModeName, + ExecModeVal); - KernelArgPool *ArgPool = nullptr; - void *kernarg = nullptr; - { - auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (it != KernelArgPoolMap.end()) { - ArgPool = (it->second).get(); + if (ExecModeVal < 0 || + ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { + DP("Error wrong exec_mode value specified in HSA code object file: " + "%d\n", + ExecModeVal); + return NULL; } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value " + "GENERIC (1)\n", + ExecModeName); } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - device_id); - } - { - if (ArgPool) { - assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); - kernarg = ArgPool->allocate(arg_num); - } - if (!kernarg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } + check("Loading computation property", err); - // Copy explicit arguments - for (int i = 0; i < arg_num; i++) { - memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); - } + KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, + CallStackAddr, e->name, kernarg_segment_size, + DeviceInfo.KernArgPool)); + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + } - // Initialize implicit arguments. 
TODO: Which of these can be dropped - impl_implicit_args_t *impl_args = - reinterpret_cast( - static_cast(kernarg) + ArgPool->kernarg_segment_size); - memset(impl_args, 0, - sizeof(impl_implicit_args_t)); // may not be necessary - impl_args->offset_x = 0; - impl_args->offset_y = 0; - impl_args->offset_z = 0; + return DeviceInfo.getOffloadEntriesTable(device_id); +} - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&hostcall_init_lock); - unsigned long buffer = hostrpc_assign_buffer( - DeviceInfo.HSAAgents[device_id], queue, device_id); - pthread_mutex_unlock(&hostcall_init_lock); - if (!buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { + void *ptr = NULL; + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. - uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { - DP("Bad offset of hostcall, exceeds kernarg segment size\n"); - } else { - memcpy(static_cast(kernarg) + Offset, &buffer, 8); - } - } + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } - // initialise pointer for implicit_argument_count == 0 ABI - impl_args->hostcall_ptr = buffer; - } + hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); + hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); + DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + ptr = (err == HSA_STATUS_SUCCESS) ? 
ptr : NULL; + return ptr; +} - packet->kernarg_address = kernarg; - } +int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; - hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); - if (s.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - packet->completion_signal = s; - hsa_signal_store_relaxed(packet->completion_signal, 1); + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} - // Publish the packet indicating it is ready to be processed - core::packet_store_release(reinterpret_cast(packet), - core::create_header(), packet->setup); +int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, __tgt_async_info *AsyncInfo) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + if (AsyncInfo) { + initAsyncInfo(AsyncInfo); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); + } else { + return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); + } +} - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); +int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; - while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} - assert(ArgPool); - ArgPool->deallocate(kernarg); - DeviceInfo.FreeSignalPool.push(s); - } +int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *AsyncInfo) { + assert(AsyncInfo && "AsyncInfo is nullptr"); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + initAsyncInfo(AsyncInfo); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); +} - DP("Kernel completed\n"); +int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + hsa_status_t err; + DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); + err = core::Runtime::Memfree(tgt_ptr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + return OFFLOAD_FAIL; + } return OFFLOAD_SUCCESS; } +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = __tgt_rtl_run_target_team_region_locked( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams, + thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {