From a0da24683b2bf40a3c16077834ca2ac98622591c Mon Sep 17 00:00:00 2001 From: George Rokos Date: Thu, 19 Jul 2018 13:41:03 +0000 Subject: [PATCH] [OpenMP][libomptarget] New map interface: remove translation code and ensure proper alignment of struct members This patch removes the translation code since this functionality is now implemented in the compiler. target_data_begin and target_data_end are also patched to handle some special cases that used to be handled by the obsolete translation function, namely ensure proper alignment of struct members when we have partially mapped structs. Mapping a struct from a higher address (i.e. not from its beginning) can result in distortion of the alignment for some of its member fields. Padding restores the original (proper) alignment. Differential revision: https://reviews.llvm.org/D44186 llvm-svn: 337455 --- openmp/libomptarget/include/omptarget.h | 2 +- openmp/libomptarget/src/interface.cpp | 355 +++++--------------------------- openmp/libomptarget/src/omptarget.cpp | 118 ++++++++--- 3 files changed, 145 insertions(+), 330 deletions(-) diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 7135257..e92a94b 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -48,7 +48,7 @@ enum tgt_map_type { OMP_TGT_MAPTYPE_LITERAL = 0x100, // mapping is implicit OMP_TGT_MAPTYPE_IMPLICIT = 0x200, - // member of struct, member given by 16 MSBs - 1 + // member of struct, member given by [16 MSBs] - 1 OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index a48d667..0f32f4e 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -33,265 +33,36 @@ EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { RTLs.UnregisterLib(desc); } -// Following datatypes and functions (tgt_oldmap_type, combined_entry_t, -// translate_map, cleanup_map) will be removed once the compiler starts using -// the new map types. - -// Old map types -enum tgt_oldmap_type { - OMP_TGT_OLDMAPTYPE_TO = 0x001, // copy data from host to device - OMP_TGT_OLDMAPTYPE_FROM = 0x002, // copy data from device to host - OMP_TGT_OLDMAPTYPE_ALWAYS = 0x004, // copy regardless of the ref. count - OMP_TGT_OLDMAPTYPE_DELETE = 0x008, // force unmapping of data - OMP_TGT_OLDMAPTYPE_MAP_PTR = 0x010, // map pointer as well as pointee - OMP_TGT_OLDMAPTYPE_FIRST_MAP = 0x020, // first occurrence of mapped variable - OMP_TGT_OLDMAPTYPE_RETURN_PTR = 0x040, // return TgtBase addr of mapped data - OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped - OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100 // copy by value - not mapped -}; - -// Temporary functions for map translation and cleanup -struct combined_entry_t { - int num_members; // number of members in combined entry - void *base_addr; // base address of combined entry - void *begin_addr; // begin address of combined entry - void *end_addr; // size of combined entry -}; - -static void translate_map(int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, int32_t &new_arg_num, - void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes, - int64_t *&new_arg_types, bool is_target_construct) { - if (arg_num <= 0) { - DP("Nothing to translate\n"); - new_arg_num = 0; - return; - } - - // array of combined entries - combined_entry_t *cmb_entries = - (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t)); - // number of combined entries - long num_combined = 0; - // old entry is MAP_PTR? - bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool)); - // old entry is member of member_of[old] cmb_entry - int *member_of = (int *) alloca(arg_num * sizeof(int)); - // temporary storage for modifications of the original arg_types - int64_t *mod_arg_types = (int64_t *) alloca(arg_num *sizeof(int64_t)); - - DP("Translating %d map entries\n", arg_num); - for (int i = 0; i < arg_num; ++i) { - member_of[i] = -1; - is_ptr_old[i] = false; - mod_arg_types[i] = arg_types[i]; - // Scan previous entries to see whether this entry shares the same base - for (int j = 0; j < i; ++j) { - void *new_begin_addr = NULL; - void *new_end_addr = NULL; - - if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { - if (args_base[i] == args[j]) { - if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) { - DP("Entry %d has the same base as entry %d's begin address\n", i, - j); - new_begin_addr = args_base[i]; - new_end_addr = (char *)args_base[i] + sizeof(void *); - assert(arg_sizes[j] == sizeof(void *)); - is_ptr_old[j] = true; - } else { - DP("Entry %d has the same base as entry %d's begin address, but " - "%d's base was a MAP_PTR too\n", i, j, j); - int32_t to_from_always_delete = - OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM | - OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE; - if (mod_arg_types[j] & to_from_always_delete) { - DP("Resetting to/from/always/delete flags for entry %d because " - "it is only a pointer to pointer\n", j); - mod_arg_types[j] &= ~to_from_always_delete; - } - } - } - } else { - if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) && - args_base[i] == args_base[j]) { - DP("Entry %d has the same base address as entry %d\n", i, j); - new_begin_addr = args[i]; - new_end_addr = (char *)args[i] + arg_sizes[i]; - } - } - - // If we have combined the entry with a previous one - if (new_begin_addr) { - int id; - if(member_of[j] == -1) { - // We have a new entry - id = num_combined++; - DP("Creating new combined entry %d for old entry %d\n", id, j); - // Initialize new entry - cmb_entries[id].num_members = 1; - cmb_entries[id].base_addr = args_base[j]; - if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { - cmb_entries[id].begin_addr = args_base[j]; - cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j]; - } else { - cmb_entries[id].begin_addr = args[j]; - cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j]; - } - member_of[j] = id; - } else { - // Reuse existing combined entry - DP("Reusing existing combined entry %d\n", member_of[j]); - id = member_of[j]; - } - - // Update combined entry - DP("Adding entry %d to combined entry %d\n", i, id); - cmb_entries[id].num_members++; - // base_addr stays the same - cmb_entries[id].begin_addr = - std::min(cmb_entries[id].begin_addr, new_begin_addr); - cmb_entries[id].end_addr = - std::max(cmb_entries[id].end_addr, new_end_addr); - member_of[i] = id; - break; - } - } - } - - DP("New entries: %ld combined + %d original\n", num_combined, arg_num); - new_arg_num = arg_num + num_combined; - new_args_base = (void **) malloc(new_arg_num * sizeof(void *)); - new_args = (void **) malloc(new_arg_num * sizeof(void *)); - new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); - new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); - - const int64_t alignment = 8; - - int next_id = 0; // next ID - int next_cid = 0; // next combined ID - int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int)); - for (int i = 0; i < arg_num; ++i) { - // It is member_of - if (member_of[i] == next_cid) { - int cid = next_cid++; // ID of this combined entry - int nid = next_id++; // ID of the new (global) entry - combined_to_new_id[cid] = nid; - DP("Combined entry %3d will become new entry %3d\n", cid, nid); - - int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n", - padding, DPxPTR(cmb_entries[cid].begin_addr)); - cmb_entries[cid].begin_addr = - (char *)cmb_entries[cid].begin_addr - padding; - } - - new_args_base[nid] = cmb_entries[cid].base_addr; - new_args[nid] = cmb_entries[cid].begin_addr; - new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr - - (char *)cmb_entries[cid].begin_addr); - new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", " - "size %" PRId64 ", type 0x%" PRIx64 "\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid]); - } else if (member_of[i] != -1) { - DP("Combined entry %3d has been encountered before, do nothing\n", - member_of[i]); - } - - // Now that the combined entry (the one the old entry was a member of) has - // been inserted into the new arguments list, proceed with the old entry. - int nid = next_id++; - DP("Old entry %3d will become new entry %3d\n", i, nid); - - new_args_base[nid] = args_base[i]; - new_args[nid] = args[i]; - new_arg_sizes[nid] = arg_sizes[i]; - int64_t old_type = mod_arg_types[i]; - - if (is_ptr_old[i]) { - // Reset TO and FROM flags - old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM); - } - - if (member_of[i] == -1) { - if (!is_target_construct) - old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM; - new_arg_types[nid] = old_type; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 - ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid], i); - } else { - // Old entry is not FIRST_MAP - old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP; - // Add MEMBER_OF - int new_member_of = combined_to_new_id[member_of[i]]; - old_type |= ((int64_t)new_member_of + 1) << 48; - new_arg_types[nid] = old_type; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 - ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid], i, new_member_of); - } - } -} - -static void cleanup_map(int32_t new_arg_num, void **new_args_base, - void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types, - int32_t arg_num, void **args_base) { - if (new_arg_num > 0) { - int offset = new_arg_num - arg_num; - for (int32_t i = 0; i < arg_num; ++i) { - // Restore old base address - args_base[i] = new_args_base[i+offset]; - } - free(new_args_base); - free(new_args); - free(new_arg_sizes); - free(new_arg_types); - } -} - /// creates host-to-target data mapping, stores it in the /// libomptarget.so internal structure (an entry in a stack of data maps) /// and passes the data to the device. EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - DP("Entering data begin region for device %ld with %d mappings\n", device_id, - arg_num); + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); // No devices available? if (device_id == OFFLOAD_DEVICE_DEFAULT) { device_id = omp_get_default_device(); - DP("Use default device id %ld\n", device_id); + DP("Use default device id %" PRId64 "\n", device_id); } if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %ld ready\n", device_id); + DP("Failed to get device %" PRId64 " ready\n", device_id); return; } DeviceTy& Device = Devices[device_id]; - // Translate maps - int32_t new_arg_num; - void **new_args_base; - void **new_args; - int64_t *new_arg_sizes; - int64_t *new_arg_types; - translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, - new_args_base, new_args, new_arg_sizes, new_arg_types, false); - - //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); - target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes, - new_arg_types); - - // Cleanup translation memory - cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, - new_arg_types, arg_num, args_base); +#ifdef OMPTARGET_DEBUG + for (int i=0; i> 48) - 1; } @@ -189,10 +221,33 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, void *HstPtrBegin = args[i]; void *HstPtrBase = args_base[i]; + int64_t data_size = arg_sizes[i]; + + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + // Address of pointer on the host and device, respectively. void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; bool IsNew, Pointer_IsNew; bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we + // have reached this point via __tgt_target_data_begin and not __tgt_target + // then no argument is marked as TARGET_PARAM ("omp target data map" is not + // associated with a target region, so there are no target parameters). This + // may be considered a hack, we could revise the scheme in the future. bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); @@ -213,28 +268,22 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, } void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - arg_sizes[i], IsNew, IsImplicit, UpdateRef); - if (!TgtPtrBegin && arg_sizes[i]) { - // If arg_sizes[i]==0, then the argument is a pointer to NULL, so - // getOrAlloc() returning NULL is not an error. + data_size, IsNew, IsImplicit, UpdateRef); + if (!TgtPtrBegin && data_size) { + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " "illegal mapping).\n"); } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { - void *ret_ptr; - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) - ret_ptr = Pointer_TgtPtrBegin; - else { - bool IsLast; // not used - ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false); - } - - DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr)); - args_base[i] = ret_ptr; + uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; + void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); + DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); + args_base[i] = TgtPtrBase; } if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { @@ -243,7 +292,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, copy = true; } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { // Copy data only if the "parent" struct has RefCount==1. - short parent_idx = member_of(arg_types[i]); + int32_t parent_idx = member_of(arg_types[i]); long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); assert(parent_rc > 0 && "parent struct not found"); if (parent_rc == 1) { @@ -253,8 +302,8 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, if (copy) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); rc = OFFLOAD_FAIL; @@ -297,16 +346,33 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, continue; void *HstPtrBegin = args[i]; + int64_t data_size = arg_sizes[i]; + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + bool IsLast; bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, UpdateRef); DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); bool DelEntry = IsLast || ForceDelete; @@ -324,7 +390,7 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { // Copy data only if the "parent" struct has RefCount==1. - short parent_idx = member_of(arg_types[i]); + int32_t parent_idx = member_of(arg_types[i]); long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); assert(parent_rc > 0 && "parent struct not found"); if (parent_rc == 1) { @@ -334,8 +400,8 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, if (DelEntry || Always || CopyMember) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); rc = OFFLOAD_FAIL; @@ -348,7 +414,7 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, // copies. If the struct is going to be deallocated, remove any remaining // shadow pointer entries for this struct. uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i]; + uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); it != Device.ShadowPtrMap.end(); ++it) { @@ -378,7 +444,7 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, // Deallocate map if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete); + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete); if (rt != OFFLOAD_SUCCESS) { DP("Deallocating data from device failed.\n"); rc = OFFLOAD_FAIL; -- 2.7.4