From b18751578421b6cc93f70a690d4902d28e18a1a4 Mon Sep 17 00:00:00 2001
From: Tobias Grosser <tobias@grosser.es>
Date: Thu, 4 Aug 2016 09:15:58 +0000
Subject: [PATCH] GPGPU: Cache PTX kernels

We always keep a number of already compiled kernels available to avoid
costly recompilation.

llvm-svn: 277707
---
 polly/tools/GPURuntime/GPUJIT.c | 56 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c
index 24320f8..d066d91 100644
--- a/polly/tools/GPURuntime/GPUJIT.c
+++ b/polly/tools/GPURuntime/GPUJIT.c
@@ -20,6 +20,7 @@
 #include <string.h>
 
 static int DebugMode;
+static int CacheMode;
 
 static void debug_print(const char *format, ...) {
   if (!DebugMode)
@@ -40,6 +41,7 @@ struct PollyGPUContextT {
 struct PollyGPUFunctionT {
   CUfunction Cuda;
   CUmodule CudaModule;
+  const char *PTXString; /* PTXBuffer given to polly_getKernel; cache key. */
 };
 
 struct PollyGPUDevicePtrT {
@@ -249,6 +251,11 @@ PollyGPUContext *polly_initContext() {
   char DeviceName[256];
   int DeviceCount = 0;
 
+  static __thread PollyGPUContext *CurrentContext = NULL;
+
+  if (CurrentContext)
+    return CurrentContext;
+
   /* Get API handles. */
   if (initialDeviceAPIs() == 0) {
     fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
@@ -282,13 +289,41 @@ PollyGPUContext *polly_initContext() {
   }
   CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
 
+  CacheMode = getenv("POLLY_NOCACHE") == 0;
+
+  if (CacheMode)
+    CurrentContext = Context;
+
   return Context;
 }
 
+/* Unload Kernel's CUDA module (if any) and free Kernel; NULL is a no-op. */
+static void freeKernel(PollyGPUFunction *Kernel) {
+  /* Test Kernel before reading Kernel->CudaModule. */
+  if (Kernel && Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+  free(Kernel); /* free(NULL) is a no-op. */
+}
+
+#define KERNEL_CACHE_SIZE 10
+
 PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
                                   const char *KernelName) {
   dump_function();
 
+  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
+  static __thread int NextCacheItem = 0;
+
+  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
+    // We exploit here the property that all Polly-ACC kernels are allocated
+    // as global constants, hence a pointer comparison is sufficient to
+    // determine equality.
+    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
+      debug_print("  -> using cached kernel\n");
+      return KernelCache[i];
+    }
+  }
+
   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
 
   if (Function == 0) {
@@ -361,17 +396,27 @@ PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
 
   CuLinkDestroyFcnPtr(LState);
 
+  Function->PTXString = PTXBuffer;
+
+  if (CacheMode) {
+    if (KernelCache[NextCacheItem])
+      freeKernel(KernelCache[NextCacheItem]);
+
+    KernelCache[NextCacheItem] = Function;
+
+    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
+  }
+
   return Function;
 }
 
 void polly_freeKernel(PollyGPUFunction *Kernel) {
   dump_function();
 
-  if (Kernel->CudaModule)
-    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+  if (CacheMode) /* Cached kernels are freed on eviction in polly_getKernel. */
+    return;
 
-  if (Kernel)
-    free(Kernel);
+  freeKernel(Kernel); /* Caching disabled: release the kernel now. */
 }
 
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
@@ -448,6 +493,9 @@ void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
 void polly_freeContext(PollyGPUContext *Context) {
   dump_function();
 
+  if (CacheMode)
+    return;
+
   if (Context->Cuda) {
     CuCtxDestroyFcnPtr(Context->Cuda);
     free(Context);
-- 
2.7.4