##===----------------------------------------------------------------------===##
-#
+#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
+#
##===----------------------------------------------------------------------===##
#
# Build offloading library libomptarget.so.
api.cpp
device.cpp
interface.cpp
+ MemoryManager.cpp
rtl.cpp
omptarget.cpp
)
--- /dev/null
+//===----------- MemoryManager.cpp - Target independent memory manager ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for managing target memory.
+// It is very expensive to call alloc/free functions of target devices. The
+// MemoryManagerTy in this file is to reduce the number of invocations of those
+// functions by buffering allocated device memory. In this way, when a memory is
+// not used, it will not be freed on the device directly. The buffer is
+// organized in a number of buckets for efficient look up. A memory will go to
+// corresponding bucket based on its size. When a new memory request comes in,
+// it will first check whether there is free memory of same size. If yes,
+// returns it directly. Otherwise, allocate one on device.
+//
+// It also provides a way to opt out the memory manager. Memory
+// allocation/deallocation will only be managed if the requested size is less
+// than SizeThreshold, which can be configured via an environment variable
+// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MemoryManager.h"
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+namespace {
+constexpr const size_t BucketSize[] = {
+ 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7,
+ 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13};
+
+constexpr const int NumBuckets = sizeof(BucketSize) / sizeof(BucketSize[0]);
+
+/// The threshold to manage memory using memory manager. If the request size is
+/// larger than \p SizeThreshold, the allocation will not be managed by the
+/// memory manager. This variable can be configured via an env \p
+/// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD. By default, the value is 8KB.
+size_t SizeThreshold = 1U << 13;
+
+/// Find the previous number that is power of 2 given a number that is not power
+/// of 2.
+size_t floorToPowerOfTwo(size_t Num) {
+ Num |= Num >> 1;
+ Num |= Num >> 2;
+ Num |= Num >> 4;
+ Num |= Num >> 8;
+ Num |= Num >> 16;
+ Num |= Num >> 32;
+ Num += 1;
+ return Num >> 1;
+}
+
+/// Find a suitable bucket
+int findBucket(size_t Size) {
+ const size_t F = floorToPowerOfTwo(Size);
+
+ DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
+
+ int L = 0, H = NumBuckets - 1;
+ while (H - L > 1) {
+ int M = (L + H) >> 1;
+ if (BucketSize[M] == F)
+ return M;
+ if (BucketSize[M] > F)
+ H = M - 1;
+ else
+ L = M;
+ }
+
+ assert(L >= 0 && L < NumBuckets && "L is out of range");
+
+ DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
+
+ return L;
+}
+} // namespace
+
+MemoryManagerTy::MemoryManagerTy(DeviceTy &Dev, size_t Threshold)
+ : FreeLists(NumBuckets), FreeListLocks(NumBuckets), Device(Dev) {
+ if (Threshold)
+ SizeThreshold = Threshold;
+}
+
+MemoryManagerTy::~MemoryManagerTy() {
+ // TODO: There is a little issue that target plugin is destroyed before this
+ // object, therefore the memory free will not succeed.
+ // Deallocate all memory in map
+ for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); ++Itr) {
+ assert(Itr->second.Ptr && "nullptr in map table");
+ deleteOnDevice(Itr->second.Ptr);
+ }
+}
+
+void *MemoryManagerTy::allocateOnDevice(size_t Size, void *HstPtr) const {
+ return Device.RTL->data_alloc(Device.RTLDeviceID, Size, HstPtr);
+}
+
+int MemoryManagerTy::deleteOnDevice(void *Ptr) const {
+ return Device.RTL->data_delete(Device.RTLDeviceID, Ptr);
+}
+
+void *MemoryManagerTy::freeAndAllocate(size_t Size, void *HstPtr) {
+ std::vector<void *> RemoveList;
+
+ // Deallocate all memory in FreeList
+ for (int I = 0; I < NumBuckets; ++I) {
+ FreeListTy &List = FreeLists[I];
+ std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
+ if (List.empty())
+ continue;
+ for (const NodeTy &N : List) {
+ deleteOnDevice(N.Ptr);
+ RemoveList.push_back(N.Ptr);
+ }
+ FreeLists[I].clear();
+ }
+
+ // Remove all nodes in the map table which have been released
+ if (!RemoveList.empty()) {
+ std::lock_guard<std::mutex> LG(MapTableLock);
+ for (void *P : RemoveList)
+ PtrToNodeTable.erase(P);
+ }
+
+ // Try allocate memory again
+ return allocateOnDevice(Size, HstPtr);
+}
+
+void *MemoryManagerTy::allocateOrFreeAndAllocateOnDevice(size_t Size,
+ void *HstPtr) {
+ void *TgtPtr = allocateOnDevice(Size, HstPtr);
+ // We cannot get memory from the device. It might be due to OOM. Let's
+ // free all memory in FreeLists and try again.
+ if (TgtPtr == nullptr) {
+ DP("Failed to get memory on device. Free all memory in FreeLists and "
+ "try again.\n");
+ TgtPtr = freeAndAllocate(Size, HstPtr);
+ }
+
+#ifdef OMPTARGET_DEBUG
+ if (TgtPtr == nullptr)
+ DP("Still cannot get memory on device probably because the device is "
+ "OOM.\n");
+#endif
+
+ return TgtPtr;
+}
+
+void *MemoryManagerTy::allocate(size_t Size, void *HstPtr) {
+ // If the size is zero, we will not bother the target device. Just return
+ // nullptr directly.
+ if (Size == 0)
+ return nullptr;
+
+ DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n",
+ Size, DPxPTR(HstPtr));
+
+ // If the size is greater than the threshold, allocate it directly from
+ // device.
+ if (Size > SizeThreshold) {
+ DP("%zu is greater than the threshold %zu. Allocate it directly from "
+ "device\n",
+ Size, SizeThreshold);
+ void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+ DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
+
+ return TgtPtr;
+ }
+
+ NodeTy *NodePtr = nullptr;
+
+ // Try to get a node from FreeList
+ {
+ const int B = findBucket(Size);
+ FreeListTy &List = FreeLists[B];
+
+ NodeTy TempNode(Size, nullptr);
+ std::lock_guard<std::mutex> LG(FreeListLocks[B]);
+ FreeListTy::const_iterator Itr = List.find(TempNode);
+
+ if (Itr != List.end()) {
+ NodePtr = &Itr->get();
+ List.erase(Itr);
+ }
+ }
+
+#ifdef OMPTARGET_DEBUG
+ if (NodePtr != nullptr)
+ DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr));
+#endif
+
+ // We cannot find a valid node in FreeLists. Let's allocate on device and
+ // create a node for it.
+ if (NodePtr == nullptr) {
+ DP("Cannot find a node in the FreeLists. Allocate on device.\n");
+ // Allocate one on device
+ void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
+
+ if (TgtPtr == nullptr)
+ return nullptr;
+
+ // Create a new node and add it into the map table
+ {
+ std::lock_guard<std::mutex> Guard(MapTableLock);
+ auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr));
+ NodePtr = &Itr.first->second;
+ }
+
+ DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n",
+ DPxPTR(NodePtr), DPxPTR(TgtPtr), Size);
+ }
+
+ assert(NodePtr && "NodePtr should not be nullptr at this point");
+
+ return NodePtr->Ptr;
+}
+
+int MemoryManagerTy::free(void *TgtPtr) {
+ DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
+
+ NodeTy *P = nullptr;
+
+ // Look it up into the table
+ {
+ std::lock_guard<std::mutex> G(MapTableLock);
+ auto Itr = PtrToNodeTable.find(TgtPtr);
+
+ // We don't remove the node from the map table because the map does not
+ // change.
+ if (Itr != PtrToNodeTable.end())
+ P = &Itr->second;
+ }
+
+ // The memory is not managed by the manager
+ if (P == nullptr) {
+ DP("Cannot find its node. Delete it on device directly.\n");
+ return deleteOnDevice(TgtPtr);
+ }
+
+ // Insert the node to the free list
+ const int B = findBucket(P->Size);
+
+ DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B);
+
+ {
+ std::lock_guard<std::mutex> G(FreeListLocks[B]);
+ FreeLists[B].insert(*P);
+ }
+
+ return OFFLOAD_SUCCESS;
+}
--- /dev/null
+//===----------- MemoryManager.h - Target independent memory manager ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for target independent memory manager.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H
+#define LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H
+
+#include <cassert>
+#include <functional>
+#include <list>
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+// Forward declaration
+struct DeviceTy;
+
+class MemoryManagerTy {
+ /// A structure stores the meta data of a target pointer
+ struct NodeTy {
+ /// Memory size
+ const size_t Size;
+ /// Target pointer
+ void *Ptr;
+
+ /// Constructor
+ NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
+ };
+
+ /// To make \p NodePtrTy ordered when they're put into \p std::multiset.
+ struct NodeCmpTy {
+ bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
+ return LHS.Size < RHS.Size;
+ }
+ };
+
+ /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
+ /// the look up procedure more efficient.
+ using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
+
+ /// A list of \p FreeListTy entries, each of which is a \p std::multiset of
+ /// Nodes whose size is less or equal to a specific bucket size.
+ std::vector<FreeListTy> FreeLists;
+ /// A list of mutex for each \p FreeListTy entry
+ std::vector<std::mutex> FreeListLocks;
+ /// A table to map from a target pointer to its node
+ std::unordered_map<void *, NodeTy> PtrToNodeTable;
+ /// The mutex for the table \p PtrToNodeTable
+ std::mutex MapTableLock;
+ /// A reference to its corresponding \p DeviceTy object
+ DeviceTy &Device;
+
+ /// Request memory from target device
+ void *allocateOnDevice(size_t Size, void *HstPtr) const;
+
+ /// Deallocate data on device
+ int deleteOnDevice(void *Ptr) const;
+
+ /// This function is called when it tries to allocate memory on device but the
+ /// device returns out of memory. It will first free all memory in the
+ /// FreeList and try to allocate again.
+ void *freeAndAllocate(size_t Size, void *HstPtr);
+
+ /// The goal is to allocate memory on the device. It first tries to allocate
+ /// directly on the device. If a \p nullptr is returned, it might be because
+ /// the device is OOM. In that case, it will free all unused memory and then
+ /// try again.
+ void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr);
+
+public:
+ /// Constructor. If \p Threshold is non-zero, then the default threshold will
+ /// be overwritten by \p Threshold.
+ MemoryManagerTy(DeviceTy &Dev, size_t Threshold = 0);
+
+ /// Destructor
+ ~MemoryManagerTy();
+
+ /// Allocate memory of size \p Size from target device. \p HstPtr is used to
+ /// assist the allocation.
+ void *allocate(size_t Size, void *HstPtr);
+
+ /// Deallocate memory pointed by \p TgtPtr
+ int free(void *TgtPtr);
+};
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_SRC_MEMORYMANAGER_H
//===----------------------------------------------------------------------===//
#include "device.h"
+#include "MemoryManager.h"
#include "private.h"
#include "rtl.h"
/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
DevicesTy Devices;
+DeviceTy::DeviceTy(const DeviceTy &D)
+ : DeviceID(D.DeviceID), RTL(D.RTL), RTLDeviceID(D.RTLDeviceID),
+ IsInit(D.IsInit), InitFlag(), HasPendingGlobals(D.HasPendingGlobals),
+ HostDataToTargetMap(D.HostDataToTargetMap),
+ PendingCtorsDtors(D.PendingCtorsDtors), ShadowPtrMap(D.ShadowPtrMap),
+ DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(),
+ LoopTripCnt(D.LoopTripCnt), MemoryManager(nullptr) {}
+
+DeviceTy &DeviceTy::operator=(const DeviceTy &D) {
+ DeviceID = D.DeviceID;
+ RTL = D.RTL;
+ RTLDeviceID = D.RTLDeviceID;
+ IsInit = D.IsInit;
+ HasPendingGlobals = D.HasPendingGlobals;
+ HostDataToTargetMap = D.HostDataToTargetMap;
+ PendingCtorsDtors = D.PendingCtorsDtors;
+ ShadowPtrMap = D.ShadowPtrMap;
+ LoopTripCnt = D.LoopTripCnt;
+
+ return *this;
+}
+
+DeviceTy::DeviceTy(RTLInfoTy *RTL)
+ : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
+ HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(),
+ ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(),
+ MemoryManager(nullptr) {}
+
+DeviceTy::~DeviceTy() = default;
+
int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
DataMapMtx.lock();
// Make call to init_requires if it exists for this plugin.
if (RTL->init_requires)
RTL->init_requires(RTLs->RequiresFlags);
- int32_t rc = RTL->init_device(RTLDeviceID);
- if (rc == OFFLOAD_SUCCESS) {
- IsInit = true;
- }
+ int32_t Ret = RTL->init_device(RTLDeviceID);
+ if (Ret != OFFLOAD_SUCCESS)
+ return;
+
+ // The memory manager will only be disabled when users provide a threshold via
+ // the environment variable \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD and set
+ // it to 0.
+ if (const char *Env = std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) {
+ size_t Threshold = std::stoul(Env);
+ if (Threshold)
+ MemoryManager = std::make_unique<MemoryManagerTy>(*this, Threshold);
+ } else
+ MemoryManager = std::make_unique<MemoryManagerTy>(*this);
+
+ IsInit = true;
}
/// Thread-safe method to initialize the device only once.
}
void *DeviceTy::allocData(int64_t Size, void *HstPtr) {
+ // If memory manager is enabled, we will allocate data via memory manager.
+ if (MemoryManager)
+ return MemoryManager->allocate(Size, HstPtr);
+
return RTL->data_alloc(RTLDeviceID, Size, HstPtr);
}
int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
+ // If memory manager is enabled, we will deallocate data via memory manager.
+ if (MemoryManager)
+ return MemoryManager->free(TgtPtrBegin);
+
return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
}
#include <cstddef>
#include <list>
#include <map>
+#include <memory>
#include <mutex>
#include <set>
#include <vector>
struct __tgt_bin_desc;
struct __tgt_target_table;
struct __tgt_async_info;
+class MemoryManagerTy;
/// Map between host data and target data.
struct HostDataToTargetTy {
// moved into the target task in libomp.
std::map<int32_t, uint64_t> LoopTripCnt;
- DeviceTy(RTLInfoTy *RTL)
- : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
- HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(),
- ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {}
+ /// Memory manager
+ std::unique_ptr<MemoryManagerTy> MemoryManager;
+
+ DeviceTy(RTLInfoTy *RTL);
// The existence of mutexes makes DeviceTy non-copyable. We need to
// provide a copy constructor and an assignment operator explicitly.
- DeviceTy(const DeviceTy &d)
- : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
- IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
- HostDataToTargetMap(d.HostDataToTargetMap),
- PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
- DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(),
- LoopTripCnt(d.LoopTripCnt) {}
-
- DeviceTy& operator=(const DeviceTy &d) {
- DeviceID = d.DeviceID;
- RTL = d.RTL;
- RTLDeviceID = d.RTLDeviceID;
- IsInit = d.IsInit;
- HasPendingGlobals = d.HasPendingGlobals;
- HostDataToTargetMap = d.HostDataToTargetMap;
- PendingCtorsDtors = d.PendingCtorsDtors;
- ShadowPtrMap = d.ShadowPtrMap;
- LoopTripCnt = d.LoopTripCnt;
-
- return *this;
- }
+ DeviceTy(const DeviceTy &D);
+
+ DeviceTy &operator=(const DeviceTy &D);
+
+ ~DeviceTy();
// Return true if data can be copied to DstDevice directly
bool isDataExchangable(const DeviceTy& DstDevice);
--- /dev/null
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
+
+#include <omp.h>
+
+#include <cassert>
+#include <iostream>
+
+int main(int argc, char *argv[]) {
+#pragma omp parallel for
+ for (int i = 0; i < 16; ++i) {
+ for (int n = 1; n < (1 << 13); n <<= 1) {
+ void *p = omp_target_alloc(n * sizeof(int), 0);
+ omp_target_free(p, 0);
+ }
+ }
+
+#pragma omp parallel for
+ for (int i = 0; i < 16; ++i) {
+ for (int n = 1; n < (1 << 13); n <<= 1) {
+ int *p = (int *)omp_target_alloc(n * sizeof(int), 0);
+#pragma omp target teams distribute parallel for is_device_ptr(p)
+ for (int j = 0; j < n; ++j) {
+ p[j] = i;
+ }
+ int buffer[n];
+#pragma omp target teams distribute parallel for is_device_ptr(p) \
+ map(from \
+ : buffer)
+ for (int j = 0; j < n; ++j) {
+ buffer[j] = p[j];
+ }
+ for (int j = 0; j < n; ++j) {
+ assert(buffer[j] == i);
+ }
+ omp_target_free(p, 0);
+ }
+ }
+
+ std::cout << "PASS\n";
+ return 0;
+}
+
+// CHECK: PASS