From b18751578421b6cc93f70a690d4902d28e18a1a4 Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Thu, 4 Aug 2016 09:15:58 +0000 Subject: [PATCH] GPGPU: Cache PTX kernels We always keep a number of already compiled kernels available to avoid costly recompilation. llvm-svn: 277707 --- polly/tools/GPURuntime/GPUJIT.c | 56 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index 24320f8..d066d91 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -20,6 +20,7 @@ #include static int DebugMode; +static int CacheMode; static void debug_print(const char *format, ...) { if (!DebugMode) @@ -40,6 +41,7 @@ struct PollyGPUContextT { struct PollyGPUFunctionT { CUfunction Cuda; CUmodule CudaModule; + const char *PTXString; }; struct PollyGPUDevicePtrT { @@ -249,6 +251,11 @@ PollyGPUContext *polly_initContext() { char DeviceName[256]; int DeviceCount = 0; + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + /* Get API handles. 
*/ if (initialDeviceAPIs() == 0) { fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); @@ -282,13 +289,41 @@ PollyGPUContext *polly_initContext() { } CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); + CacheMode = getenv("POLLY_NOCACHE") == 0; + + if (CacheMode) + CurrentContext = Context; + return Context; } +static void freeKernel(PollyGPUFunction *Kernel) { + if (Kernel->CudaModule) + CuModuleUnloadFcnPtr(Kernel->CudaModule); + + if (Kernel) + free(Kernel); +} + +#define KERNEL_CACHE_SIZE 10 + PollyGPUFunction *polly_getKernel(const char *PTXBuffer, const char *KernelName) { dump_function(); + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparison is sufficient to + // determine equality. + if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); if (Function == 0) { @@ -361,17 +396,27 @@ PollyGPUFunction *polly_getKernel(const char *PTXBuffer, CuLinkDestroyFcnPtr(LState); + Function->PTXString = PTXBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernel(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + return Function; } void polly_freeKernel(PollyGPUFunction *Kernel) { dump_function(); - if (Kernel->CudaModule) - CuModuleUnloadFcnPtr(Kernel->CudaModule); + if (CacheMode) + return; - if (Kernel) - free(Kernel); + freeKernel(Kernel); } void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, @@ -448,6 +493,9 @@ void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { void polly_freeContext(PollyGPUContext *Context) { dump_function(); + if 
(CacheMode) + return; + if (Context->Cuda) { CuCtxDestroyFcnPtr(Context->Cuda); free(Context); -- 2.7.4