/******************************************************************************/
#include "GPUJIT.h"
-
-#ifdef HAS_LIBCUDART
#include <cuda.h>
#include <cuda_runtime.h>
-#endif /* HAS_LIBCUDART */
-
-#ifdef HAS_LIBOPENCL
-#ifdef __APPLE__
-#include <OpenCL/opencl.h>
-#else
-#include <CL/cl.h>
-#endif
-#endif /* HAS_LIBOPENCL */
-
#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
static int DebugMode;
static int CacheMode;
-static PollyGPURuntime Runtime = RUNTIME_NONE;
-
static void debug_print(const char *format, ...) {
if (!DebugMode)
return;
}
#define dump_function() debug_print("-> %s\n", __func__)
-#define KERNEL_CACHE_SIZE 10
-
-static void err_runtime() {
- fprintf(stderr, "Runtime not correctly initialized.\n");
- exit(-1);
-}
-
+/* Define Polly's GPGPU data types. */
struct PollyGPUContextT {
- void *Context;
-};
-
-struct PollyGPUFunctionT {
- void *Kernel;
-};
-
-struct PollyGPUDevicePtrT {
- void *DevicePtr;
-};
-
-/******************************************************************************/
-/* OpenCL */
-/******************************************************************************/
-#ifdef HAS_LIBOPENCL
-
-struct OpenCLContextT {
- cl_context Context;
- cl_command_queue CommandQueue;
-};
-
-struct OpenCLKernelT {
- cl_kernel Kernel;
- cl_program Program;
- const char *BinaryString;
-};
-
-struct OpenCLDevicePtrT {
- cl_mem MemObj;
-};
-
-/* Dynamic library handles for the OpenCL runtime library. */
-static void *HandleOpenCL;
-
-/* Type-defines of function pointer to OpenCL Runtime API. */
-typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
- cl_platform_id *Platforms,
- cl_uint *NumPlatforms);
-static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
-
-typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
- cl_device_type DeviceType,
- cl_uint NumEntries, cl_device_id *Devices,
- cl_uint *NumDevices);
-static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
-
-typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
- cl_device_info ParamName,
- size_t ParamValueSize, void *ParamValue,
- size_t *ParamValueSizeRet);
-static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
-
-typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
- size_t ParamValueSize, void *ParamValue,
- size_t *ParamValueSizeRet);
-static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
-
-typedef cl_context clCreateContextFcnTy(
- const cl_context_properties *Properties, cl_uint NumDevices,
- const cl_device_id *Devices,
- void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
- size_t CB, void *UserData),
- void *UserData, cl_int *ErrcodeRet);
-static clCreateContextFcnTy *clCreateContextFcnPtr;
-
-typedef cl_command_queue
-clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
- cl_command_queue_properties Properties,
- cl_int *ErrcodeRet);
-static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
-
-typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
- size_t Size, void *HostPtr,
- cl_int *ErrcodeRet);
-static clCreateBufferFcnTy *clCreateBufferFcnPtr;
-
-typedef cl_int
-clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
- cl_bool BlockingWrite, size_t Offset, size_t Size,
- const void *Ptr, cl_uint NumEventsInWaitList,
- const cl_event *EventWaitList, cl_event *Event);
-static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
-
-typedef cl_program clCreateProgramWithBinaryFcnTy(
- cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
- const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
- cl_int *ErrcodeRet);
-static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
-
-typedef cl_int clBuildProgramFcnTy(
- cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
- const char *Options,
- void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
- void *UserData);
-static clBuildProgramFcnTy *clBuildProgramFcnPtr;
-
-typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
- const char *KernelName,
- cl_int *ErrcodeRet);
-static clCreateKernelFcnTy *clCreateKernelFcnPtr;
-
-typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
- size_t ArgSize, const void *ArgValue);
-static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
-
-typedef cl_int clEnqueueNDRangeKernelFcnTy(
- cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
- const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
- const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
- const cl_event *EventWaitList, cl_event *Event);
-static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
-
-typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
- cl_mem Buffer, cl_bool BlockingRead,
- size_t Offset, size_t Size, void *Ptr,
- cl_uint NumEventsInWaitList,
- const cl_event *EventWaitList,
- cl_event *Event);
-static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
-
-typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
-static clFlushFcnTy *clFlushFcnPtr;
-
-typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
-static clFinishFcnTy *clFinishFcnPtr;
-
-typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
-static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
-
-typedef cl_int clReleaseProgramFcnTy(cl_program Program);
-static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
-
-typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
-static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
-
-typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
-static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
-
-typedef cl_int clReleaseContextFcnTy(cl_context Context);
-static clReleaseContextFcnTy *clReleaseContextFcnPtr;
-
-static void *getAPIHandleCL(void *Handle, const char *FuncName) {
- char *Err;
- void *FuncPtr;
- dlerror();
- FuncPtr = dlsym(Handle, FuncName);
- if ((Err = dlerror()) != 0) {
- fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
- return 0;
- }
- return FuncPtr;
-}
-
-static int initialDeviceAPILibrariesCL() {
- HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
- if (!HandleOpenCL) {
- fprintf(stderr, "Cannot open library: %s. \n", dlerror());
- return 0;
- }
- return 1;
-}
-
-static int initialDeviceAPIsCL() {
- if (initialDeviceAPILibrariesCL() == 0)
- return 0;
-
- /* Get function pointer to OpenCL Runtime API.
- *
- * Note that compilers conforming to the ISO C standard are required to
- * generate a warning if a conversion from a void * pointer to a function
- * pointer is attempted as in the following statements. The warning
- * of this kind of cast may not be emitted by clang and new versions of gcc
- * as it is valid on POSIX 2008.
- */
- clGetPlatformIDsFcnPtr =
- (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
-
- clGetDeviceIDsFcnPtr =
- (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
-
- clGetDeviceInfoFcnPtr =
- (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
-
- clGetKernelInfoFcnPtr =
- (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
-
- clCreateContextFcnPtr =
- (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
-
- clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clCreateCommandQueue");
-
- clCreateBufferFcnPtr =
- (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
-
- clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueWriteBuffer");
-
- clCreateProgramWithBinaryFcnPtr =
- (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clCreateProgramWithBinary");
-
- clBuildProgramFcnPtr =
- (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
-
- clCreateKernelFcnPtr =
- (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
-
- clSetKernelArgFcnPtr =
- (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
-
- clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueNDRangeKernel");
-
- clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clEnqueueReadBuffer");
-
- clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
-
- clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
-
- clReleaseKernelFcnPtr =
- (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
-
- clReleaseProgramFcnPtr =
- (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
-
- clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clReleaseMemObject");
-
- clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
- HandleOpenCL, "clReleaseCommandQueue");
-
- clReleaseContextFcnPtr =
- (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
-
- return 1;
-}
-
-/* Context and Device. */
-static PollyGPUContext *GlobalContext = NULL;
-static cl_device_id GlobalDeviceID = NULL;
-
-/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
-static void printOpenCLError(int Error);
-
-static void checkOpenCLError(int Ret, const char *format, ...) {
- if (Ret == CL_SUCCESS)
- return;
-
- printOpenCLError(Ret);
- va_list args;
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
- exit(-1);
-}
-
-static PollyGPUContext *initContextCL() {
- dump_function();
-
- PollyGPUContext *Context;
-
- cl_platform_id PlatformID = NULL;
- cl_device_id DeviceID = NULL;
- cl_uint NumDevicesRet;
- cl_int Ret;
-
- char DeviceRevision[256];
- char DeviceName[256];
- size_t DeviceRevisionRetSize, DeviceNameRetSize;
-
- static __thread PollyGPUContext *CurrentContext = NULL;
-
- if (CurrentContext)
- return CurrentContext;
-
- /* Get API handles. */
- if (initialDeviceAPIsCL() == 0) {
- fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
- exit(-1);
- }
-
- /* Get number of devices that support OpenCL. */
- static const int NumberOfPlatforms = 1;
- Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
- checkOpenCLError(Ret, "Failed to get platform IDs.\n");
- // TODO: Extend to CL_DEVICE_TYPE_ALL?
- static const int NumberOfDevices = 1;
- Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
- &DeviceID, &NumDevicesRet);
- checkOpenCLError(Ret, "Failed to get device IDs.\n");
-
- GlobalDeviceID = DeviceID;
- if (NumDevicesRet == 0) {
- fprintf(stderr, "There is no device supporting OpenCL.\n");
- exit(-1);
- }
-
- /* Get device revision. */
- Ret =
- clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
- DeviceRevision, &DeviceRevisionRetSize);
- checkOpenCLError(Ret, "Failed to fetch device revision.\n");
-
- /* Get device name. */
- Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
- DeviceName, &DeviceNameRetSize);
- checkOpenCLError(Ret, "Failed to fetch device name.\n");
-
- debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
-
- /* Create context on the device. */
- Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
- if (Context == 0) {
- fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
- exit(-1);
- }
- Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
- if (Context->Context == 0) {
- fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
- exit(-1);
- }
- ((OpenCLContext *)Context->Context)->Context =
- clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
- checkOpenCLError(Ret, "Failed to create context.\n");
-
- static const int ExtraProperties = 0;
- ((OpenCLContext *)Context->Context)->CommandQueue =
- clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
- DeviceID, ExtraProperties, &Ret);
- checkOpenCLError(Ret, "Failed to create command queue.\n");
-
- if (CacheMode)
- CurrentContext = Context;
-
- GlobalContext = Context;
- return Context;
-}
-
-static void freeKernelCL(PollyGPUFunction *Kernel) {
- dump_function();
-
- if (CacheMode)
- return;
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- cl_int Ret;
- Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
- checkOpenCLError(Ret, "Failed to flush command queue.\n");
- Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
- checkOpenCLError(Ret, "Failed to finish command queue.\n");
-
- if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
- cl_int Ret =
- clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
- checkOpenCLError(Ret, "Failed to release kernel.\n");
- }
-
- if (((OpenCLKernel *)Kernel->Kernel)->Program) {
- cl_int Ret =
- clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
- checkOpenCLError(Ret, "Failed to release program.\n");
- }
-
- if (Kernel->Kernel)
- free((OpenCLKernel *)Kernel->Kernel);
-
- if (Kernel)
- free(Kernel);
-}
-
-static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
- const char *KernelName) {
- dump_function();
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
- static __thread int NextCacheItem = 0;
-
- for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
- // We exploit here the property that all Polly-ACC kernels are allocated
- // as global constants, hence a pointer comparision is sufficient to
- // determin equality.
- if (KernelCache[i] &&
- ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
- BinaryBuffer) {
- debug_print(" -> using cached kernel\n");
- return KernelCache[i];
- }
- }
-
- PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
- if (Function == 0) {
- fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
- exit(-1);
- }
- Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
- if (Function->Kernel == 0) {
- fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
- exit(-1);
- }
-
- if (!GlobalDeviceID) {
- fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
- exit(-1);
- }
-
- cl_int Ret;
- size_t BinarySize = strlen(BinaryBuffer);
- ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
- ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
- (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
- &Ret);
- checkOpenCLError(Ret, "Failed to create program from binary.\n");
-
- Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
- &GlobalDeviceID, NULL, NULL, NULL);
- checkOpenCLError(Ret, "Failed to build program.\n");
-
- ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
- ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
- checkOpenCLError(Ret, "Failed to create kernel.\n");
-
- ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
-
- if (CacheMode) {
- if (KernelCache[NextCacheItem])
- freeKernelCL(KernelCache[NextCacheItem]);
-
- KernelCache[NextCacheItem] = Function;
-
- NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
- }
-
- return Function;
-}
-
-static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
- long MemSize) {
- dump_function();
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- cl_int Ret;
- Ret = clEnqueueWriteBufferFcnPtr(
- ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
- ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
- HostData, 0, NULL, NULL);
- checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
-}
-
-static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
- long MemSize) {
- dump_function();
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- cl_int Ret;
- Ret = clEnqueueReadBufferFcnPtr(
- ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
- ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
- HostData, 0, NULL, NULL);
- checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
-}
-
-static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
- unsigned int GridDimY, unsigned int BlockDimX,
- unsigned int BlockDimY, unsigned int BlockDimZ,
- void **Parameters) {
- dump_function();
-
- cl_int Ret;
- cl_uint NumArgs;
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
- Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
- sizeof(cl_uint), &NumArgs, NULL);
- checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
-
- // TODO: Pass the size of the kernel arguments in to launchKernelCL, along
- // with the arguments themselves. This is a dirty workaround that can be
- // broken.
- for (cl_uint i = 0; i < NumArgs; i++) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret =
- clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
- if (Ret == CL_INVALID_ARG_SIZE) {
- Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
- (void *)Parameters[i]);
- checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
- }
- }
- }
- if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
- fprintf(stderr, "Failed to set Kernel argument.\n");
- printOpenCLError(Ret);
- exit(-1);
- }
- }
-
- unsigned int GridDimZ = 1;
- size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
- BlockDimZ * GridDimZ};
- size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
-
- static const int WorkDim = 3;
- OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
- Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
- WorkDim, NULL, GlobalWorkSize,
- LocalWorkSize, 0, NULL, NULL);
- checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
-}
-
-static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
- dump_function();
-
- OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
- cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
- checkOpenCLError(Ret, "Failed to free device memory.\n");
-
- free(DevPtr);
- free(Allocation);
-}
-
-static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
- dump_function();
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
- if (DevData == 0) {
- fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
- exit(-1);
- }
- DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
- if (DevData->DevicePtr == 0) {
- fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
- exit(-1);
- }
-
- cl_int Ret;
- ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
- clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
- CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
- checkOpenCLError(Ret,
- "Allocate memory for GPU device memory pointer failed.\n");
-
- return DevData;
-}
-
-static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
- dump_function();
-
- OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
- return (void *)DevPtr->MemObj;
-}
-
-static void synchronizeDeviceCL() {
- dump_function();
-
- if (!GlobalContext) {
- fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
- exit(-1);
- }
-
- if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
- CL_SUCCESS) {
- fprintf(stderr, "Synchronizing device and host memory failed.\n");
- exit(-1);
- }
-}
-
-static void freeContextCL(PollyGPUContext *Context) {
- dump_function();
-
- cl_int Ret;
-
- GlobalContext = NULL;
-
- OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
- if (Ctx->CommandQueue) {
- Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
- checkOpenCLError(Ret, "Could not release command queue.\n");
- }
-
- if (Ctx->Context) {
- Ret = clReleaseContextFcnPtr(Ctx->Context);
- checkOpenCLError(Ret, "Could not release context.\n");
- }
-
- free(Ctx);
- free(Context);
-}
-
-static void printOpenCLError(int Error) {
-
- switch (Error) {
- case CL_SUCCESS:
- // Success, don't print an error.
- break;
-
- // JIT/Runtime errors.
- case CL_DEVICE_NOT_FOUND:
- fprintf(stderr, "Device not found.\n");
- break;
- case CL_DEVICE_NOT_AVAILABLE:
- fprintf(stderr, "Device not available.\n");
- break;
- case CL_COMPILER_NOT_AVAILABLE:
- fprintf(stderr, "Compiler not available.\n");
- break;
- case CL_MEM_OBJECT_ALLOCATION_FAILURE:
- fprintf(stderr, "Mem object allocation failure.\n");
- break;
- case CL_OUT_OF_RESOURCES:
- fprintf(stderr, "Out of resources.\n");
- break;
- case CL_OUT_OF_HOST_MEMORY:
- fprintf(stderr, "Out of host memory.\n");
- break;
- case CL_PROFILING_INFO_NOT_AVAILABLE:
- fprintf(stderr, "Profiling info not available.\n");
- break;
- case CL_MEM_COPY_OVERLAP:
- fprintf(stderr, "Mem copy overlap.\n");
- break;
- case CL_IMAGE_FORMAT_MISMATCH:
- fprintf(stderr, "Image format mismatch.\n");
- break;
- case CL_IMAGE_FORMAT_NOT_SUPPORTED:
- fprintf(stderr, "Image format not supported.\n");
- break;
- case CL_BUILD_PROGRAM_FAILURE:
- fprintf(stderr, "Build program failure.\n");
- break;
- case CL_MAP_FAILURE:
- fprintf(stderr, "Map failure.\n");
- break;
- case CL_MISALIGNED_SUB_BUFFER_OFFSET:
- fprintf(stderr, "Misaligned sub buffer offset.\n");
- break;
- case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
- fprintf(stderr, "Exec status error for events in wait list.\n");
- break;
- case CL_COMPILE_PROGRAM_FAILURE:
- fprintf(stderr, "Compile program failure.\n");
- break;
- case CL_LINKER_NOT_AVAILABLE:
- fprintf(stderr, "Linker not available.\n");
- break;
- case CL_LINK_PROGRAM_FAILURE:
- fprintf(stderr, "Link program failure.\n");
- break;
- case CL_DEVICE_PARTITION_FAILED:
- fprintf(stderr, "Device partition failed.\n");
- break;
- case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
- fprintf(stderr, "Kernel arg info not available.\n");
- break;
-
- // Compiler errors.
- case CL_INVALID_VALUE:
- fprintf(stderr, "Invalid value.\n");
- break;
- case CL_INVALID_DEVICE_TYPE:
- fprintf(stderr, "Invalid device type.\n");
- break;
- case CL_INVALID_PLATFORM:
- fprintf(stderr, "Invalid platform.\n");
- break;
- case CL_INVALID_DEVICE:
- fprintf(stderr, "Invalid device.\n");
- break;
- case CL_INVALID_CONTEXT:
- fprintf(stderr, "Invalid context.\n");
- break;
- case CL_INVALID_QUEUE_PROPERTIES:
- fprintf(stderr, "Invalid queue properties.\n");
- break;
- case CL_INVALID_COMMAND_QUEUE:
- fprintf(stderr, "Invalid command queue.\n");
- break;
- case CL_INVALID_HOST_PTR:
- fprintf(stderr, "Invalid host pointer.\n");
- break;
- case CL_INVALID_MEM_OBJECT:
- fprintf(stderr, "Invalid memory object.\n");
- break;
- case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- fprintf(stderr, "Invalid image format descriptor.\n");
- break;
- case CL_INVALID_IMAGE_SIZE:
- fprintf(stderr, "Invalid image size.\n");
- break;
- case CL_INVALID_SAMPLER:
- fprintf(stderr, "Invalid sampler.\n");
- break;
- case CL_INVALID_BINARY:
- fprintf(stderr, "Invalid binary.\n");
- break;
- case CL_INVALID_BUILD_OPTIONS:
- fprintf(stderr, "Invalid build options.\n");
- break;
- case CL_INVALID_PROGRAM:
- fprintf(stderr, "Invalid program.\n");
- break;
- case CL_INVALID_PROGRAM_EXECUTABLE:
- fprintf(stderr, "Invalid program executable.\n");
- break;
- case CL_INVALID_KERNEL_NAME:
- fprintf(stderr, "Invalid kernel name.\n");
- break;
- case CL_INVALID_KERNEL_DEFINITION:
- fprintf(stderr, "Invalid kernel definition.\n");
- break;
- case CL_INVALID_KERNEL:
- fprintf(stderr, "Invalid kernel.\n");
- break;
- case CL_INVALID_ARG_INDEX:
- fprintf(stderr, "Invalid arg index.\n");
- break;
- case CL_INVALID_ARG_VALUE:
- fprintf(stderr, "Invalid arg value.\n");
- break;
- case CL_INVALID_ARG_SIZE:
- fprintf(stderr, "Invalid arg size.\n");
- break;
- case CL_INVALID_KERNEL_ARGS:
- fprintf(stderr, "Invalid kernel args.\n");
- break;
- case CL_INVALID_WORK_DIMENSION:
- fprintf(stderr, "Invalid work dimension.\n");
- break;
- case CL_INVALID_WORK_GROUP_SIZE:
- fprintf(stderr, "Invalid work group size.\n");
- break;
- case CL_INVALID_WORK_ITEM_SIZE:
- fprintf(stderr, "Invalid work item size.\n");
- break;
- case CL_INVALID_GLOBAL_OFFSET:
- fprintf(stderr, "Invalid global offset.\n");
- break;
- case CL_INVALID_EVENT_WAIT_LIST:
- fprintf(stderr, "Invalid event wait list.\n");
- break;
- case CL_INVALID_EVENT:
- fprintf(stderr, "Invalid event.\n");
- break;
- case CL_INVALID_OPERATION:
- fprintf(stderr, "Invalid operation.\n");
- break;
- case CL_INVALID_GL_OBJECT:
- fprintf(stderr, "Invalid GL object.\n");
- break;
- case CL_INVALID_BUFFER_SIZE:
- fprintf(stderr, "Invalid buffer size.\n");
- break;
- case CL_INVALID_MIP_LEVEL:
- fprintf(stderr, "Invalid mip level.\n");
- break;
- case CL_INVALID_GLOBAL_WORK_SIZE:
- fprintf(stderr, "Invalid global work size.\n");
- break;
- case CL_INVALID_PROPERTY:
- fprintf(stderr, "Invalid property.\n");
- break;
- case CL_INVALID_IMAGE_DESCRIPTOR:
- fprintf(stderr, "Invalid image descriptor.\n");
- break;
- case CL_INVALID_COMPILER_OPTIONS:
- fprintf(stderr, "Invalid compiler options.\n");
- break;
- case CL_INVALID_LINKER_OPTIONS:
- fprintf(stderr, "Invalid linker options.\n");
- break;
- case CL_INVALID_DEVICE_PARTITION_COUNT:
- fprintf(stderr, "Invalid device partition count.\n");
- break;
- case CL_INVALID_PIPE_SIZE:
- fprintf(stderr, "Invalid pipe size.\n");
- break;
- case CL_INVALID_DEVICE_QUEUE:
- fprintf(stderr, "Invalid device queue.\n");
- break;
-
- // NVIDIA specific error.
- case -9999:
- fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
- break;
-
- default:
- fprintf(stderr, "Unknown error code!\n");
- break;
- }
-}
-
-#endif /* HAS_LIBOPENCL */
-/******************************************************************************/
-/* CUDA */
-/******************************************************************************/
-#ifdef HAS_LIBCUDART
-
-struct CUDAContextT {
CUcontext Cuda;
};
-struct CUDAKernelT {
+struct PollyGPUFunctionT {
CUfunction Cuda;
CUmodule CudaModule;
- const char *BinaryString;
+ const char *PTXString;
};
-struct CUDADevicePtrT {
+struct PollyGPUDevicePtrT {
CUdeviceptr Cuda;
};
static CuMemAllocFcnTy *CuMemAllocFcnPtr;
typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
- CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
- unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
- unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
- void **KernelParams, void **Extra);
+ CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+ unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
+ unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
+ void **kernelParams, void **extra);
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
-typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
- const void *Image);
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+ const void *image);
static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
-typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
- CUjitInputType Type, void *Data,
- size_t Size, const char *Name,
- unsigned int NumOptions,
- CUjit_option *Options,
- void **OptionValues);
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+ CUjitInputType type, void *data,
+ size_t size, const char *name,
+ unsigned int numOptions,
+ CUjit_option *options,
+ void **optionValues);
static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
-typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
- CUjit_option *Options,
- void **OptionValues,
- CUlinkState *StateOut);
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+ CUjit_option *options,
+ void **optionValues,
+ CUlinkState *stateOut);
static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
-typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
- size_t *SizeOut);
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+ size_t *sizeOut);
static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
-typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
-static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
+static void *getAPIHandle(void *Handle, const char *FuncName) {
char *Err;
void *FuncPtr;
dlerror();
FuncPtr = dlsym(Handle, FuncName);
if ((Err = dlerror()) != 0) {
- fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
+ fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
return 0;
}
return FuncPtr;
}
-static int initialDeviceAPILibrariesCUDA() {
+static int initialDeviceAPILibraries() {
HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
if (!HandleCuda) {
- fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+ printf("Cannot open library: %s. \n", dlerror());
return 0;
}
HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
if (!HandleCudaRT) {
- fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+ printf("Cannot open library: %s. \n", dlerror());
return 0;
}
return 1;
}
-static int initialDeviceAPIsCUDA() {
- if (initialDeviceAPILibrariesCUDA() == 0)
+static int initialDeviceAPIs() {
+ if (initialDeviceAPILibraries() == 0)
return 0;
/* Get function pointer to CUDA Driver APIs.
* as it is valid on POSIX 2008.
*/
CuLaunchKernelFcnPtr =
- (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
+ (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
CuMemAllocFcnPtr =
- (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
+ (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
- CuMemFreeFcnPtr =
- (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
+ CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
CuMemcpyDtoHFcnPtr =
- (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
+ (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
CuMemcpyHtoDFcnPtr =
- (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
+ (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
CuModuleUnloadFcnPtr =
- (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
+ (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
CuCtxDestroyFcnPtr =
- (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
+ (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
- CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
+ CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
CuDeviceGetCountFcnPtr =
- (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
+ (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
CuDeviceGetFcnPtr =
- (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
+ (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
CuCtxCreateFcnPtr =
- (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
+ (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
- CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
- HandleCuda, "cuModuleLoadDataEx");
+ CuModuleLoadDataExFcnPtr =
+ (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
CuModuleLoadDataFcnPtr =
- (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
+ (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
- CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
+ CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
HandleCuda, "cuModuleGetFunction");
CuDeviceComputeCapabilityFcnPtr =
- (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
+ (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
HandleCuda, "cuDeviceComputeCapability");
CuDeviceGetNameFcnPtr =
- (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
+ (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
CuLinkAddDataFcnPtr =
- (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
+ (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
CuLinkCreateFcnPtr =
- (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
+ (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
CuLinkCompleteFcnPtr =
- (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
+ (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
CuLinkDestroyFcnPtr =
- (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
+ (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
CuCtxSynchronizeFcnPtr =
- (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
+ (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
/* Get function pointer to CUDA Runtime APIs. */
- CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
+ CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
HandleCudaRT, "cudaThreadSynchronize");
return 1;
}
-static PollyGPUContext *initContextCUDA() {
+PollyGPUContext *polly_initContext() {
+ DebugMode = getenv("POLLY_DEBUG") != 0;
+
dump_function();
PollyGPUContext *Context;
CUdevice Device;
return CurrentContext;
/* Get API handles. */
- if (initialDeviceAPIsCUDA() == 0) {
- fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
+ if (initialDeviceAPIs() == 0) {
+ fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
exit(-1);
}
if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
- fprintf(stderr, "Initializing the CUDA driver API failed.\n");
+ fprintf(stdout, "Initializing the CUDA driver API failed.\n");
exit(-1);
}
/* Get number of devices that supports CUDA. */
CuDeviceGetCountFcnPtr(&DeviceCount);
if (DeviceCount == 0) {
- fprintf(stderr, "There is no device supporting CUDA.\n");
+ fprintf(stdout, "There is no device supporting CUDA.\n");
exit(-1);
}
/* Create context on the device. */
Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
if (Context == 0) {
- fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
+ fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
exit(-1);
}
- Context->Context = malloc(sizeof(CUDAContext));
- if (Context->Context == 0) {
- fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
- exit(-1);
- }
- CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
+ CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
+
+ CacheMode = getenv("POLLY_NOCACHE") == 0;
if (CacheMode)
CurrentContext = Context;
return Context;
}
-static void freeKernelCUDA(PollyGPUFunction *Kernel) {
- dump_function();
-
- if (CacheMode)
- return;
-
- if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
- CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
-
- if (Kernel->Kernel)
- free((CUDAKernel *)Kernel->Kernel);
+static void freeKernel(PollyGPUFunction *Kernel) {
+ if (Kernel->CudaModule)
+ CuModuleUnloadFcnPtr(Kernel->CudaModule);
if (Kernel)
free(Kernel);
}
-static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
- const char *KernelName) {
+#define KERNEL_CACHE_SIZE 10
+
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+ const char *KernelName) {
dump_function();
static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
// We exploit here the property that all Polly-ACC kernels are allocated
// as global constants, hence a pointer comparision is sufficient to
// determin equality.
- if (KernelCache[i] &&
- ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
+ if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
debug_print(" -> using cached kernel\n");
return KernelCache[i];
}
}
PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+
if (Function == 0) {
- fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
- exit(-1);
- }
- Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
- if (Function->Kernel == 0) {
- fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
+ fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
exit(-1);
}
memset(ErrorLog, 0, sizeof(ErrorLog));
CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
- Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
- strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
+ Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+ strlen(PTXBuffer) + 1, 0, 0, 0, 0);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
+ fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
exit(-1);
}
Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "Complete ptx linker step failed.\n");
- fprintf(stderr, "\n%s\n", ErrorLog);
+ fprintf(stdout, "Complete ptx linker step failed.\n");
+ fprintf(stdout, "\n%s\n", ErrorLog);
exit(-1);
}
debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
InfoLog);
- Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
- CuOut);
+ Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "Loading ptx assembly text failed.\n");
+ fprintf(stdout, "Loading ptx assembly text failed.\n");
exit(-1);
}
- Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
- ((CUDAKernel *)Function->Kernel)->CudaModule,
+ Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
KernelName);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "Loading kernel function failed.\n");
+ fprintf(stdout, "Loading kernel function failed.\n");
exit(-1);
}
CuLinkDestroyFcnPtr(LState);
- ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
+ Function->PTXString = PTXBuffer;
if (CacheMode) {
if (KernelCache[NextCacheItem])
- freeKernelCUDA(KernelCache[NextCacheItem]);
+ freeKernel(KernelCache[NextCacheItem]);
KernelCache[NextCacheItem] = Function;
return Function;
}
-static void synchronizeDeviceCUDA() {
+void polly_freeKernel(PollyGPUFunction *Kernel) {
dump_function();
- if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
- fprintf(stderr, "Synchronizing device and host memory failed.\n");
- exit(-1);
- }
+
+ if (CacheMode)
+ return;
+
+ freeKernel(Kernel);
}
-static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
- long MemSize) {
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+ long MemSize) {
dump_function();
- CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
+ CUdeviceptr CuDevData = DevData->Cuda;
CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
}
-static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
- long MemSize) {
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
+ long MemSize) {
dump_function();
- if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
- MemSize) != CUDA_SUCCESS) {
- fprintf(stderr, "Copying results from device to host memory failed.\n");
+ if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
+ fprintf(stdout, "Copying results from device to host memory failed.\n");
+ exit(-1);
+ }
+}
+void polly_synchronizeDevice() {
+ dump_function();
+ if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+ fprintf(stdout, "Synchronizing device and host memory failed.\n");
exit(-1);
}
}
-static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
- unsigned int GridDimY, unsigned int BlockDimX,
- unsigned int BlockDimY, unsigned int BlockDimZ,
- void **Parameters) {
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+ unsigned int GridDimY, unsigned int BlockDimX,
+ unsigned int BlockDimY, unsigned int BlockDimZ,
+ void **Parameters) {
dump_function();
unsigned GridDimZ = 1;
void **Extra = 0;
CUresult Res;
- Res =
- CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
- GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
- SharedMemBytes, Stream, Parameters, Extra);
+ Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
+ BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
+ Stream, Parameters, Extra);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "Launching CUDA kernel failed.\n");
+ fprintf(stdout, "Launching CUDA kernel failed.\n");
exit(-1);
}
}
-static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
dump_function();
- CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
- CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
- free(DevPtr);
+ CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
free(Allocation);
}
-static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
+PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
dump_function();
PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
+
if (DevData == 0) {
- fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
- exit(-1);
- }
- DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
- if (DevData->DevicePtr == 0) {
- fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+ fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
exit(-1);
}
- CUresult Res =
- CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
+ CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
if (Res != CUDA_SUCCESS) {
- fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+ fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
exit(-1);
}
return DevData;
}
-static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
- dump_function();
-
- CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
- return (void *)DevPtr->Cuda;
-}
-
-static void freeContextCUDA(PollyGPUContext *Context) {
- dump_function();
-
- CUDAContext *Ctx = (CUDAContext *)Context->Context;
- if (Ctx->Cuda) {
- CuCtxDestroyFcnPtr(Ctx->Cuda);
- free(Ctx);
- free(Context);
- }
-
- dlclose(HandleCuda);
- dlclose(HandleCudaRT);
-}
-
-#endif /* HAS_LIBCUDART */
-/******************************************************************************/
-/* API */
-/******************************************************************************/
-
-PollyGPUContext *polly_initContext() {
- DebugMode = getenv("POLLY_DEBUG") != 0;
- CacheMode = getenv("POLLY_NOCACHE") == 0;
-
- dump_function();
-
- PollyGPUContext *Context;
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- Context = initContextCUDA();
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- Context = initContextCL();
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-
- return Context;
-}
-
-void polly_freeKernel(PollyGPUFunction *Kernel) {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- freeKernelCUDA(Kernel);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- freeKernelCL(Kernel);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-}
-
-PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
- const char *KernelName) {
- dump_function();
-
- PollyGPUFunction *Function;
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- Function = getKernelCUDA(BinaryBuffer, KernelName);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- Function = getKernelCL(BinaryBuffer, KernelName);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-
- return Function;
-}
-
-void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
- long MemSize) {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- copyFromHostToDeviceCL(HostData, DevData, MemSize);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-}
-
-void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
- long MemSize) {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- copyFromDeviceToHostCL(DevData, HostData, MemSize);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-}
-
-void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
- unsigned int GridDimY, unsigned int BlockDimX,
- unsigned int BlockDimY, unsigned int BlockDimZ,
- void **Parameters) {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
- BlockDimZ, Parameters);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
- Parameters);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-}
-
-void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- freeDeviceMemoryCUDA(Allocation);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- freeDeviceMemoryCL(Allocation);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-}
-
-PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
- dump_function();
-
- PollyGPUDevicePtr *DevData;
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- DevData = allocateMemoryForDeviceCUDA(MemSize);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- DevData = allocateMemoryForDeviceCL(MemSize);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-
- return DevData;
-}
-
void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
dump_function();
- void *DevPtr;
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- DevPtr = getDevicePtrCUDA(Allocation);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- DevPtr = getDevicePtrCL(Allocation);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
-
- return DevPtr;
-}
-
-void polly_synchronizeDevice() {
- dump_function();
-
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- synchronizeDeviceCUDA();
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- synchronizeDeviceCL();
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
- }
+ return (void *)Allocation->Cuda;
}
void polly_freeContext(PollyGPUContext *Context) {
if (CacheMode)
return;
- switch (Runtime) {
-#ifdef HAS_LIBCUDART
- case RUNTIME_CUDA:
- freeContextCUDA(Context);
- break;
-#endif /* HAS_LIBCUDART */
-#ifdef HAS_LIBOPENCL
- case RUNTIME_CL:
- freeContextCL(Context);
- break;
-#endif /* HAS_LIBOPENCL */
- default:
- err_runtime();
+ if (Context->Cuda) {
+ CuCtxDestroyFcnPtr(Context->Cuda);
+ free(Context);
}
-}
-
-/* Initialize GPUJIT with CUDA as runtime library. */
-PollyGPUContext *polly_initContextCUDA() {
-#ifdef HAS_LIBCUDART
- Runtime = RUNTIME_CUDA;
- return polly_initContext();
-#else
- fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
- exit(-1);
-#endif /* HAS_LIBCUDART */
-}
-/* Initialize GPUJIT with OpenCL as runtime library. */
-PollyGPUContext *polly_initContextCL() {
-#ifdef HAS_LIBOPENCL
- Runtime = RUNTIME_CL;
- return polly_initContext();
-#else
- fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
- exit(-1);
-#endif /* HAS_LIBOPENCL */
+ dlclose(HandleCuda);
+ dlclose(HandleCudaRT);
}