From 0ae66a4b425602d1a1a1c04687e0b7c8deae3577 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Thu, 29 Mar 2012 17:26:36 +0000 Subject: [PATCH] Removed all direct dependencies to the intel driver (ie files contained in intel/) and replaced them by dependencies to the call backs. This should allow an easier integration with the performance simulator --- src/CMakeLists.txt | 6 +- src/cl_command_queue.c | 53 +++--- src/cl_command_queue.h | 23 +-- src/cl_command_queue_gen7.c | 106 ++---------- src/cl_context.c | 22 +-- src/cl_context.h | 9 +- src/cl_driver.c | 48 ++++-- src/cl_driver.cpp | 4 +- src/cl_driver.h | 151 ++++++++++++++++- src/cl_image.c | 4 - src/cl_kernel.c | 16 +- src/cl_kernel.h | 5 +- src/cl_mem.c | 48 +++--- src/cl_mem.h | 10 +- src/intel/intel_driver.c | 16 +- src/intel/intel_gpgpu.c | 292 +++++++++++++++------------------ src/intel/intel_gpgpu.h | 119 +------------- src/sim/{sim_buffer.c => sim_driver.c} | 15 +- src/sim/{sim_buffer.h => sim_driver.h} | 7 +- 19 files changed, 436 insertions(+), 518 deletions(-) rename src/sim/{sim_buffer.c => sim_driver.c} (94%) rename src/sim/{sim_buffer.h => sim_driver.h} (91%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index facc990..d3517c1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,9 +18,9 @@ SET(OPENCL_SRC cl_context.c cl_command_queue.c cl_command_queue_gen7.c - cl_buffer.c - cl_buffer.cpp - sim/sim_buffer.c + cl_driver.c + cl_driver.cpp + sim/sim_driver.c intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index cef1485..0be37c8 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -25,8 +25,7 @@ #include "cl_mem.h" #include "cl_utils.h" #include "cl_alloc.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" +#include "cl_driver.h" #include #include @@ -42,8 +41,7 @@ cl_command_queue_new(cl_context ctx) queue->magic = CL_MAGIC_QUEUE_HEADER; queue->ref_n = 1; queue->ctx = ctx; - 
TRY_ALLOC_NO_ERR (queue->gpgpu, - intel_gpgpu_new((struct intel_driver*) ctx->intel_drv)); + TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv)); /* Append the command queue in the list */ pthread_mutex_lock(&ctx->queue_lock); @@ -87,7 +85,7 @@ cl_command_queue_delete(cl_command_queue queue) } cl_mem_delete(queue->perf); cl_context_delete(queue->ctx); - intel_gpgpu_delete(queue->gpgpu); + cl_gpgpu_delete(queue->gpgpu); queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(queue); } @@ -98,26 +96,26 @@ cl_command_queue_add_ref(cl_command_queue queue) atomic_inc(&queue->ref_n); } - LOCAL cl_int +LOCAL cl_int cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, char *curbe, - drm_intel_bo **local, - drm_intel_bo **priv, - drm_intel_bo **scratch, + cl_buffer **local, + cl_buffer **priv, + cl_buffer **scratch, uint32_t local_sz) { cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - drm_intel_bo *sync_bo = NULL; + cl_gpgpu *gpgpu = queue->gpgpu; + cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx); + cl_buffer *sync_bo = NULL; cl_int err = CL_SUCCESS; #if 0 cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); cl_mem mem = NULL; - drm_intel_bo *bo = NULL, *sync_bo = NULL; + cl_buffer *bo = NULL, *sync_bo = NULL; const size_t max_thread = ctx->device->max_compute_unit; cl_int err = CL_SUCCESS; uint32_t i, index; @@ -163,7 +161,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, assert(k->patch.local_surf.offset % SURFACE_SZ == 0); index = k->patch.local_surf.offset / SURFACE_SZ; assert(index != MAX_SURFACES - 1); - *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64); + *local = cl_buffer_alloc(bufmgr, "CL local surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3); } else if (local) @@ -178,7 +176,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, 
assert(k->patch.private_surf.offset % SURFACE_SZ == 0); index = k->patch.private_surf.offset / SURFACE_SZ; assert(index != MAX_SURFACES - 1); - *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64); + *priv = cl_buffer_alloc(bufmgr, "CL private surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3); } else if(priv) @@ -193,17 +191,17 @@ cl_command_queue_bind_surface(cl_command_queue queue, assert(k->patch.scratch.offset % SURFACE_SZ == 0); assert(index != MAX_SURFACES - 1); index = k->patch.scratch.offset / SURFACE_SZ; - *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64); + *scratch = cl_buffer_alloc(bufmgr, "CL scratch surface", sz, 64); gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3); } else if (scratch) *scratch = NULL; #endif /* Now bind a bo used for synchronization */ - sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64); - gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3); + sync_bo = cl_buffer_alloc(bufmgr, "sync surface", 64, 64); + cl_gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3); if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); + cl_buffer_unreference(queue->last_batch); queue->last_batch = sync_bo; // error: @@ -212,9 +210,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, } #if USE_FULSIM -extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*); -extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*); -extern void aub_exec_dump_raw_file(drm_intel_bo*, size_t offset, size_t sz); +extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr*); +extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr*, FILE*); +extern void aub_exec_dump_raw_file(cl_buffer*, size_t offset, size_t sz); static void cl_run_fulsim(void) @@ -435,15 +433,14 @@ cl_command_queue_ND_range(cl_command_queue queue, const size_t *global_wk_sz, const size_t *local_wk_sz) { - intel_gpgpu_t *gpgpu = queue->gpgpu; - const int32_t ver = 
intel_gpgpu_version(gpgpu); + const int32_t ver = cl_driver_get_ver(queue->ctx->drv); cl_int err = CL_SUCCESS; #if USE_FULSIM - drm_intel_bufmgr *bufmgr = NULL; + cl_buffer_mgr *bufmgr = NULL; FILE *file = fopen("dump.aub", "wb"); FATAL_IF (file == NULL, "Unable to open file dump.aub"); - bufmgr = cl_context_get_intel_bufmgr(queue->ctx); + bufmgr = cl_context_get_bufmgr(queue->ctx); drm_intel_bufmgr_gem_set_aubfile(bufmgr, file); #endif /* USE_FULSIM */ @@ -469,8 +466,8 @@ cl_command_queue_finish(cl_command_queue queue) { if (queue->last_batch == NULL) return CL_SUCCESS; - drm_intel_bo_wait_rendering(queue->last_batch); - drm_intel_bo_unreference(queue->last_batch); + cl_buffer_wait_rendering(queue->last_batch); + cl_buffer_unreference(queue->last_batch); queue->last_batch = NULL; return CL_SUCCESS; } diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index c47350e..f03e61c 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -21,6 +21,7 @@ #define __CL_COMMAND_QUEUE_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -28,14 +29,14 @@ struct intel_gpgpu; /* Basically, this is a (kind-of) batch buffer */ struct _cl_command_queue { - uint64_t magic; /* To identify it as a command queue */ - volatile int ref_n; /* We reference count this object */ - cl_context ctx; /* Its parent context */ - cl_command_queue prev, next; /* We chain the command queues together */ - struct intel_gpgpu *gpgpu; /* Setup all GEN commands */ - cl_mem perf; /* Where to put the perf counters */ - cl_mem fulsim_out; /* Fulsim will output this buffer */ - struct _drm_intel_bo *last_batch;/* To synchronize using clFinish */ + uint64_t magic; /* To identify it as a command queue */ + volatile int ref_n; /* We reference count this object */ + cl_context ctx; /* Its parent context */ + cl_command_queue prev, next; /* We chain the command queues together */ + cl_gpgpu *gpgpu; /* Setup all GEN commands */ + cl_mem perf; /* Where to put the 
perf counters */ + cl_mem fulsim_out; /* Fulsim will output this buffer */ + cl_buffer *last_batch; /* To synchronize using clFinish */ }; /* Allocate and initialize a new command queue. Also insert it in the list of @@ -69,9 +70,9 @@ extern cl_int cl_command_queue_finish(cl_command_queue); extern cl_int cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, char *curbe, - struct _drm_intel_bo **local, - struct _drm_intel_bo **priv, - struct _drm_intel_bo **scratch, + cl_buffer **local, + cl_buffer **priv, + cl_buffer **scratch, uint32_t local_sz); #endif /* __CL_COMMAND_QUEUE_H__ */ diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index ecf27dd..78af68f 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -25,8 +25,6 @@ #include "cl_mem.h" #include "cl_utils.h" #include "cl_alloc.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" #include #include @@ -90,11 +88,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, const size_t *local_wk_sz) { cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; + cl_gpgpu *gpgpu = queue->gpgpu; char *curbe = NULL; /* Does not include per-thread local IDs */ char *final_curbe = NULL; /* Includes them */ - drm_intel_bo *private_bo = NULL, *scratch_bo = NULL; - genx_gpgpu_kernel_t kernel; + cl_buffer *private_bo = NULL, *scratch_bo = NULL; + cl_gpgpu_kernel_t kernel; const uint32_t simd_sz = cl_kernel_get_simd_width(ker); size_t i, batch_sz = 0u, local_sz = 0u, thread_n = 0u, id_offset = 0u, cst_sz = 0u; cl_int err = CL_SUCCESS; @@ -115,12 +113,12 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); + 
cl_buffer_unreference(queue->last_batch); queue->last_batch = NULL; cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0); - gpgpu_states_setup(gpgpu, &kernel, 1); + cl_gpgpu_states_setup(gpgpu, &kernel, 1); /* CURBE step 2. Give the localID and upload it to video memory */ TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz)); @@ -128,99 +126,19 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, for (i = 0; i < thread_n; ++i) memcpy(final_curbe + cst_sz * i, curbe, cst_sz - 3*32); TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n); - gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); + cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); /* Start a new batch buffer */ batch_sz = cl_kernel_compute_batch_sz(ker); - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); + cl_gpgpu_batch_reset(gpgpu, batch_sz); + cl_gpgpu_batch_start(gpgpu); /* Issue the GPGPU_WALKER command */ - gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); + cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); /* Close the batch buffer and submit it */ - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - -#if 0 - cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bo *private_bo = NULL, *scratch_bo = NULL; - char *curbe = NULL; /* Does not include per-thread local IDs */ - char *final_curbe = NULL; /* Includes them */ - genx_gpgpu_kernel_t kernel; - //const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz; - const size_t simd_sz = 16; - size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz; - size_t i, thread_n, id_offset; - cl_int err = CL_SUCCESS; - - /* Setup kernel */ - kernel.name = "OCL kernel"; - kernel.grf_blocks = 128; - kernel.bin = ker->kernel_heap; // _PLASMA ; NULL - kernel.size = ker->kernel_heap_sz; // _PLASMA ; 0 - kernel.bo = ker->bo; - kernel.barrierID = 0; - 
kernel.use_barrier = ker->patch.exec_env.has_barriers; - kernel.slm_sz = cl_kernel_local_memory_sz(ker); - - /* All arguments must have been set */ - TRY (cl_kernel_check_args, ker); - - /* Check that the local work sizes are OK */ - TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz); - //kernel.thread_n = thread_n = local_sz / simd_sz; - kernel.thread_n = thread_n = local_sz / simd_sz; - - /* CURBE step 1. Allocate and fill fields shared by threads in workgroup */ - if (cst_sz > 0) { - assert(ker->cst_buffer); - curbe = cl_kernel_create_cst_buffer(ker, - global_wk_off, - global_wk_sz, - local_wk_sz, - 3, - thread_n); - } - id_offset = cst_sz = ALIGN(cst_sz, 32); /* Align the user data on 32 bytes */ - kernel.cst_sz = cst_sz += 3 * 32; /* Add local IDs (16 words) */ - - /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0); - gpgpu_states_setup(gpgpu, &kernel, 1); - - /* CURBE step 2. 
Give the localID and upload it to video memory */ - TRY_ALLOC (final_curbe, (char*) cl_calloc(thread_n, cst_sz)); - for (i = 0; i < thread_n; ++i) - memcpy(final_curbe + cst_sz * i, curbe, cst_sz); - TRY (cl_set_local_ids, final_curbe, local_wk_sz, simd_sz, cst_sz, id_offset, thread_n); - gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); - - /* Start a new batch buffer */ - batch_sz = cl_kernel_compute_batch_sz(ker); - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); - - /* Issue the GPGPU_WALKER command */ - gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz); - - /* Close the batch buffer and submit it */ - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - -error: - /* Release all temporary buffers */ - if (private_bo) drm_intel_bo_unreference(private_bo); - if (scratch_bo) drm_intel_bo_unreference(scratch_bo); - cl_free(final_curbe); - cl_free(curbe); - return err; -#endif + cl_gpgpu_batch_end(gpgpu, 0); + cl_gpgpu_flush(gpgpu); error: return err; diff --git a/src/cl_context.c b/src/cl_context.c index 2e8ff24..3d926b4 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -25,6 +25,7 @@ #include "cl_mem.h" #include "cl_alloc.h" #include "cl_utils.h" +#include "cl_driver.h" #include "CL/cl.h" @@ -33,13 +34,6 @@ #include #include -/* Do not include the full dependency */ -struct intel_driver; -/* Get the command buffer interface */ -extern struct _drm_intel_bufmgr* intel_driver_get_bufmgr(struct intel_driver*); -/* Get the Gen HW version */ -extern uint32_t intel_driver_get_ver(struct intel_driver*); - static cl_int cl_context_properties_is_ok(const cl_context_properties *properties) { @@ -130,10 +124,10 @@ cl_context_new(void) cl_context ctx = NULL; TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context)); - TRY_ALLOC_NO_ERR (ctx->intel_drv, cl_intel_driver_new()); + TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new()); ctx->magic = CL_MAGIC_CONTEXT_HEADER; ctx->ref_n = 1; - ctx->ver = 
intel_driver_get_ver(ctx->intel_drv); + ctx->ver = cl_driver_get_ver(ctx->drv); pthread_mutex_init(&ctx->program_lock, NULL); pthread_mutex_init(&ctx->queue_lock, NULL); pthread_mutex_init(&ctx->buffer_lock, NULL); @@ -163,8 +157,8 @@ cl_context_delete(cl_context ctx) assert(ctx->queues == NULL); assert(ctx->programs == NULL); assert(ctx->buffers == NULL); - assert(ctx->intel_drv); - cl_intel_driver_delete(ctx->intel_drv); + assert(ctx->drv); + cl_driver_delete(ctx->drv); ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(ctx); } @@ -202,9 +196,9 @@ error: goto exit; } -struct _drm_intel_bufmgr* -cl_context_get_intel_bufmgr(cl_context ctx) +struct cl_buffer_mgr* +cl_context_get_bufmgr(cl_context ctx) { - return intel_driver_get_bufmgr((struct intel_driver*) ctx->intel_drv); + return cl_driver_get_bufmgr(ctx->drv); } diff --git a/src/cl_context.h b/src/cl_context.h index f2c6302..27f6d87 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -21,6 +21,7 @@ #define __CL_CONTEXT_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -33,7 +34,7 @@ struct intel_driver; struct _cl_context { uint64_t magic; /* To identify it as a context */ volatile int ref_n; /* We reference count this object */ - struct intel_driver *intel_drv; /* Handles the real HW */ + cl_driver *drv; /* Handles HW or simulator */ cl_device_id device; /* All information about the GPU device */ cl_command_queue queues; /* All command queues currently allocated */ cl_program programs; /* All programs currently allocated */ @@ -69,9 +70,6 @@ extern cl_command_queue cl_context_create_queue(cl_context, cl_command_queue_properties, cl_int*); -/* Use for all GPU buffers */ -extern struct _drm_intel_bufmgr* cl_context_get_intel_bufmgr(cl_context); - /* Enqueue a ND Range kernel */ extern cl_int cl_context_ND_kernel(cl_context, cl_command_queue, @@ -82,8 +80,7 @@ extern cl_int cl_context_ND_kernel(cl_context, const size_t*); /* Used for allocation */ -extern struct 
_drm_intel_bufmgr* -cl_context_get_intel_bufmgr(cl_context ctx); +extern cl_buffer_mgr* cl_context_get_bufmgr(cl_context ctx); #endif /* __CL_CONTEXT_H__ */ diff --git a/src/cl_driver.c b/src/cl_driver.c index ed2aaa0..65828e2 100644 --- a/src/cl_driver.c +++ b/src/cl_driver.c @@ -17,17 +17,43 @@ * Author: Benjamin Segovia */ -#include "cl_buffer.h" +#include "cl_driver.h" +#include "cl_utils.h" #include -cl_buffer_alloc_cb *cl_buffer_alloc = NULL; -cl_buffer_unreference_cb *cl_buffer_unreference = NULL; -cl_buffer_map_cb *cl_buffer_map = NULL; -cl_buffer_unmap_cb *cl_buffer_unmap = NULL; -cl_buffer_pin_cb *cl_buffer_pin = NULL; -cl_buffer_unpin_cb *cl_buffer_unpin = NULL; -cl_buffer_subdata_cb *cl_buffer_subdata = NULL; -cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL; -cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL; -cl_driver_get_ver_cb *cl_driver_get_ver = NULL; +/* Driver */ +LOCAL cl_driver_new_cb *cl_driver_new = NULL; +LOCAL cl_driver_delete_cb *cl_driver_delete = NULL; +LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL; + +/* Buffer */ +LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL; +LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL; +LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL; +LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL; +LOCAL cl_buffer_map_cb *cl_buffer_map = NULL; +LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL; +LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL; +LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL; +LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL; +LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL; +LOCAL cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc = NULL; +LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL; + +/* GPGPU */ +LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL; +LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL; +LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL; +LOCAL cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D = NULL; +LOCAL 
cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; +LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL; +LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL; +LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL; +LOCAL cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier = NULL; +LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL; +LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL; +LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL; +LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL; +LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL; +LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL; diff --git a/src/cl_driver.cpp b/src/cl_driver.cpp index 87e1f2e..df26518 100644 --- a/src/cl_driver.cpp +++ b/src/cl_driver.cpp @@ -18,8 +18,8 @@ */ extern "C" { -#include "sim/sim_buffer.h" -#include "intel/intel_buffer.h" +#include "sim/sim_driver.h" +#include "intel/intel_driver.h" #include "cl_utils.h" #include #include diff --git a/src/cl_driver.h b/src/cl_driver.h index ff9fa6f..116959e 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -21,11 +21,18 @@ #define __CL_BUFFER_H__ #include +#include -/* Hide behind some call backs the buffer allocation / deallocation ... This +/* XXX needed for previous driver */ +#define MAX_SURFACES 128 +#define MAX_SAMPLERS 16 + +/************************************************************************** + * cl_driver: + * Hide behind some call backs the buffer allocation / deallocation ... 
This * will allow us to make the use of a software performance simulator easier and * to minimize the code specific for the HW and for the simulator - */ + **************************************************************************/ /* Encapsulates command buffer / data buffer / kernels */ typedef struct cl_buffer cl_buffer; @@ -36,13 +43,19 @@ typedef struct cl_buffer_mgr cl_buffer_mgr; /* Encapsulates the driver backend functionalities */ typedef struct cl_driver cl_driver; +/* Encapsulates the gpgpu stream of commands */ +typedef struct cl_gpgpu cl_gpgpu; + +/************************************************************************** + * Driver + **************************************************************************/ /* Create a new driver */ typedef cl_driver* (cl_driver_new_cb)(void); -extern cl_driver_new_cb cl_driver_new; +extern cl_driver_new_cb *cl_driver_new; /* Delete the driver */ -typedef cl_driver* (cl_driver_delete_cb)(void); -extern cl_driver_delete_cb cl_driver_delete; +typedef void (cl_driver_delete_cb)(cl_driver*); +extern cl_driver_delete_cb *cl_driver_delete; /* Get the buffer manager from the driver */ typedef cl_buffer_mgr* (cl_driver_get_bufmgr_cb)(cl_driver*); @@ -52,6 +65,116 @@ extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr; typedef uint32_t (cl_driver_get_ver_cb)(cl_driver*); extern cl_driver_get_ver_cb *cl_driver_get_ver; +/************************************************************************** + * GPGPU command streamer + **************************************************************************/ +/* Describe texture tiling */ +typedef enum cl_gpgpu_tiling { + GPGPU_NO_TILE = 0, + GPGPU_TILE_X = 1, + GPGPU_TILE_Y = 2, +} cl_gpgpu_tiling_t; + +/* Cache control options */ +enum gen6_cache_control { + cc_gtt = 0x0, + cc_l3 = 0x1, + cc_llc = 0x2, + cc_llc_l3 = 0x3 +}; + + +/* Use this structure to bind kernels in the gpgpu state */ +typedef struct cl_gpgpu_kernel { + const char *name; /* kernel name and bo name */ + uint32_t 
grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ + uint32_t cst_sz; /* total size of all constants */ + cl_buffer *bo; /* kernel code in the proper addr space */ + int32_t barrierID; /* barrierID for _this_ kernel */ + uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ + uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ + uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ +} cl_gpgpu_kernel_t; + +/* Create a new gpgpu state */ +typedef cl_gpgpu* (cl_gpgpu_new_cb)(cl_driver*); +extern cl_gpgpu_new_cb *cl_gpgpu_new; + +/* Delete the gpgpu state */ +typedef void (cl_gpgpu_delete_cb)(cl_gpgpu*); +extern cl_gpgpu_delete_cb *cl_gpgpu_delete; + +/* Bind a regular unformatted buffer */ +typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu*, int32_t index, cl_buffer *, uint32_t cchint); +extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf; + +/* Set a 2d texture */ +typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state, + int32_t index, + struct cl_buffer* obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int pitch, + cl_gpgpu_tiling_t tiling); +extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D; + +/* Configure internal state */ +typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu*, uint32_t max_threads, uint32_t size_cs_entry); +extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init; + +/* Set the buffer object where to report performance counters */ +typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu*, struct cl_buffer *perf); +extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters; + +/* Fills current constant buffer with data */ +typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu*, const void* data, uint32_t size); +extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants; + +/* Setup all indirect states */ +typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu*, cl_gpgpu_kernel_t* kernel, uint32_t ker_n); +extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup; + +/* Make HW threads use barrierID */ 
+typedef void (cl_gpgpu_update_barrier_cb)(cl_gpgpu*, uint32_t barrierID, uint32_t thread_n); +extern cl_gpgpu_update_barrier_cb *cl_gpgpu_update_barrier; + +/* Upload the constant samplers as specified inside the OCL kernel */ +typedef void (cl_gpgpu_upload_samplers_cb)(cl_gpgpu *state, const void *data, uint32_t n); +extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers; + +/* Set a sampler */ +typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu*, uint32_t index, uint32_t non_normalized); +extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler; + +/* Allocate the batch buffer and return the BO used for the batch buffer */ +typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu*, size_t sz); +extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset; + +/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */ +typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu*); +extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start; + +/* atomic end with possibly inserted flush */ +typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu*, int32_t flush_mode); +extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end; + +/* Flush the command buffer */ +typedef void (cl_gpgpu_flush_cb)(cl_gpgpu *state); +extern cl_gpgpu_flush_cb *cl_gpgpu_flush; + +/* Will spawn all threads */ +typedef void (cl_gpgpu_walker_cb)(cl_gpgpu *state, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], + const size_t global_wk_sz[3], + const size_t local_wk_sz[3]); +extern cl_gpgpu_walker_cb *cl_gpgpu_walker; + +/************************************************************************** + * Buffer + **************************************************************************/ /* Allocate a buffer */ typedef cl_buffer* (cl_buffer_alloc_cb)(cl_buffer_mgr*, const char*, unsigned long, unsigned long); extern cl_buffer_alloc_cb *cl_buffer_alloc; @@ -60,16 +183,24 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc; typedef void (cl_buffer_unreference_cb)(cl_buffer*); extern cl_buffer_unreference_cb 
*cl_buffer_unreference; +/* Add one more ref on a buffer */ +typedef void (cl_buffer_reference_cb)(cl_buffer*); +extern cl_buffer_reference_cb *cl_buffer_reference; + /* Map a buffer */ -typedef void* (cl_buffer_map_cb)(cl_buffer*); +typedef int (cl_buffer_map_cb)(cl_buffer*, uint32_t write_enable); extern cl_buffer_map_cb *cl_buffer_map; /* Unmap a buffer */ -typedef void* (cl_buffer_unmap_cb)(cl_buffer*); +typedef int (cl_buffer_unmap_cb)(cl_buffer*); extern cl_buffer_unmap_cb *cl_buffer_unmap; +/* Get the virtual address (when mapped) */ +typedef void* (cl_buffer_get_virtual_cb)(cl_buffer*); +extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual; + /* Pin a buffer */ -typedef int (cl_buffer_pin_cb)(cl_buffer*); +typedef int (cl_buffer_pin_cb)(cl_buffer*, uint32_t alignment); extern cl_buffer_pin_cb *cl_buffer_pin; /* Unpin a buffer */ @@ -84,5 +215,9 @@ extern cl_buffer_subdata_cb *cl_buffer_subdata; typedef int (cl_buffer_emit_reloc_cb) (cl_buffer *, uint32_t, cl_buffer*, uint32_t, uint32_t, uint32_t); extern cl_buffer_emit_reloc_cb *cl_buffer_emit_reloc; +/* Wait for all pending rendering for this buffer to complete */ +typedef int (cl_buffer_wait_rendering_cb) (cl_buffer *); +extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering; + #endif /* __CL_BUFFER_H__ */ diff --git a/src/cl_image.c b/src/cl_image.c index 143c9fb..bba741d 100644 --- a/src/cl_image.c +++ b/src/cl_image.c @@ -19,11 +19,7 @@ #include "cl_image.h" #include "cl_utils.h" -#ifdef _PLASMA -#include "plasma/intel_defines.h" -#else #include "intel/intel_defines.h" -#endif #include diff --git a/src/cl_kernel.c b/src/cl_kernel.c index bc63fb8..6214aea 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -25,8 +25,6 @@ #include "cl_alloc.h" #include "cl_utils.h" #include "CL/cl.h" -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" #include "gen/program.h" #include @@ -45,8 +43,8 @@ cl_kernel_delete(cl_kernel k) if (atomic_dec(&k->ref_n) > 1) return; /* Release one reference on all 
bos we own */ - if (k->bo) drm_intel_bo_unreference(k->bo); - if (k->const_bo) drm_intel_bo_unreference(k->const_bo); + if (k->bo) cl_buffer_unreference(k->bo); + if (k->const_bo) cl_buffer_unreference(k->const_bo); /* This will be true for kernels created by clCreateKernel */ if (k->ref_its_program) cl_program_delete(k->program); @@ -104,15 +102,15 @@ LOCAL void cl_kernel_setup(cl_kernel k, const struct GenKernel *gen_kernel) { cl_context ctx = k->program->ctx; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); + cl_buffer_mgr *bufmgr = cl_context_get_bufmgr(ctx); /* Allocate the gen code here */ const uint32_t code_sz = GenKernelGetCodeSize(gen_kernel); const char *code = GenKernelGetCode(gen_kernel); - k->bo = drm_intel_bo_alloc(bufmgr, "CL kernel", code_sz, 64u); + k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u); /* Upload the code */ - drm_intel_bo_subdata(k->bo, 0, code_sz, code); + cl_buffer_subdata(k->bo, 0, code_sz, code); k->gen_kernel = gen_kernel; } @@ -132,8 +130,8 @@ cl_kernel_dup(const cl_kernel from) to->program = from->program; /* Retain the bos */ - if (from->bo) drm_intel_bo_reference(from->bo); - if (from->const_bo) drm_intel_bo_reference(from->const_bo); + if (from->bo) cl_buffer_reference(from->bo); + if (from->const_bo) cl_buffer_reference(from->const_bo); /* We retain the program destruction since this kernel (user allocated) * depends on the program for some of its pointers diff --git a/src/cl_kernel.h b/src/cl_kernel.h index f5a7876..fbc17ca 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -22,6 +22,7 @@ #include "cl_defs.h" #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" #include @@ -34,8 +35,8 @@ struct GenKernel; struct _cl_kernel { uint64_t magic; /* To identify it as a kernel */ volatile int ref_n; /* We reference count this object */ - struct _drm_intel_bo *bo; /* The code itself */ - struct _drm_intel_bo *const_bo; /* Buffer for all __constants values in the OCL program */ + struct 
cl_buffer *bo; /* The code itself */ + struct cl_buffer *const_bo; /* Buffer for all __constants values in the OCL program */ cl_program program; /* Owns this structure (and pointers) */ const struct GenKernel *gen_kernel; /* (Opaque) compiler structure for the OCL kernel */ uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */ diff --git a/src/cl_mem.c b/src/cl_mem.c index a1aa53b..2d29b9d 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -23,11 +23,7 @@ #include "cl_utils.h" #include "cl_alloc.h" #include "cl_device_id.h" - -#include "intel/intel_driver.h" -#include "intel/intel_gpgpu.h" -#include "intel_bufmgr.h" /* libdrm_intel */ -#include "cl_buffer.h" +#include "cl_driver.h" #include "CL/cl.h" #include "CL/cl_intel.h" @@ -41,7 +37,7 @@ cl_mem_allocate(cl_context ctx, cl_int is_tiled, cl_int *errcode) { - drm_intel_bufmgr *bufmgr = NULL; + cl_buffer_mgr *bufmgr = NULL; cl_mem mem = NULL; cl_int err = CL_SUCCESS; size_t alignment = 64; @@ -67,9 +63,9 @@ cl_mem_allocate(cl_context ctx, alignment = 4096; /* Allocate space in memory */ - bufmgr = cl_context_get_intel_bufmgr(ctx); + bufmgr = cl_context_get_bufmgr(ctx); assert(bufmgr); - mem->bo = drm_intel_bo_alloc(bufmgr, "CL memory object", sz, alignment); + mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); if (UNLIKELY(mem->bo == NULL)) { err = CL_MEM_ALLOCATION_FAILURE; goto error; @@ -119,7 +115,7 @@ cl_mem_new(cl_context ctx, /* Copy the data if required */ if (flags & CL_MEM_COPY_HOST_PTR) /* TODO check other flags too */ - drm_intel_bo_subdata(mem->bo, 0, sz, data); + cl_buffer_subdata(mem->bo, 0, sz, data); exit: if (errcode_ret) @@ -141,8 +137,8 @@ cl_mem_copy_data_linear(cl_mem mem, { size_t x, y, p; char *dst; - drm_intel_bo_map(mem->bo, 1); - dst = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + dst = cl_buffer_get_virtual(mem->bo); for (y = 0; y < h; ++y) { char *src = (char*) data + pitch * y; for (x = 0; x < w; ++x) { @@ 
-152,7 +148,7 @@ cl_mem_copy_data_linear(cl_mem mem, src += bpp; } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } static const uint32_t tile_sz = 4096; /* 4KB per tile */ @@ -179,8 +175,8 @@ cl_mem_copy_data_tilex(cl_mem mem, char *img = NULL; char *end = (char*) data + pitch * h; - drm_intel_bo_map(mem->bo, 1); - img = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + img = cl_buffer_get_virtual(mem->bo); for (tiley = 0; tiley < tiley_n; ++tiley) for (tilex = 0; tilex < tilex_n; ++tilex) { char *tile = img + (tilex + tiley * tilex_n) * tile_sz; @@ -193,7 +189,7 @@ cl_mem_copy_data_tilex(cl_mem mem, } } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } static void @@ -214,8 +210,8 @@ cl_mem_copy_data_tiley(cl_mem mem, char *img = NULL; char *end = (char*) data + pitch * h; - drm_intel_bo_map(mem->bo, 1); - img = drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); + img = cl_buffer_get_virtual(mem->bo); for (tiley = 0; tiley < tiley_n; ++tiley) for (tilex = 0; tilex < tilex_n; ++tilex) { char *tile = img + (tiley * tilex_n + tilex) * tile_sz; @@ -229,7 +225,7 @@ cl_mem_copy_data_tiley(cl_mem mem, } } } - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); } LOCAL cl_mem @@ -279,7 +275,7 @@ cl_mem_new_image2D(cl_context ctx, #undef DO_IMAGE_ERROR /* Pick up tiling mode (we do only linear on SNB) */ - if (ctx->intel_drv->gen_ver != 6) + if (cl_driver_get_ver(ctx->drv) != 6) tiling = CL_TILE_Y; /* Tiling requires to align both pitch and height */ @@ -336,7 +332,7 @@ cl_mem_delete(cl_mem mem) if (atomic_dec(&mem->ref_n) > 1) return; if (LIKELY(mem->bo != NULL)) - drm_intel_bo_unreference(mem->bo); + cl_buffer_unreference(mem->bo); /* Remove it from the list */ assert(mem->ctx); @@ -363,15 +359,15 @@ cl_mem_add_ref(cl_mem mem) LOCAL void* cl_mem_map(cl_mem mem) { - drm_intel_bo_map(mem->bo, 1); - assert(drm_intel_bo_get_virtual(mem->bo)); - return drm_intel_bo_get_virtual(mem->bo); + cl_buffer_map(mem->bo, 1); 
+ assert(cl_buffer_get_virtual(mem->bo)); + return cl_buffer_get_virtual(mem->bo); } LOCAL cl_int cl_mem_unmap(cl_mem mem) { - drm_intel_bo_unmap(mem->bo); + cl_buffer_unmap(mem->bo); return CL_SUCCESS; } @@ -381,7 +377,7 @@ cl_mem_pin(cl_mem mem) assert(mem); if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0)) return CL_INVALID_MEM; - drm_intel_bo_pin(mem->bo, 4096); + cl_buffer_pin(mem->bo, 4096); return CL_SUCCESS; } @@ -391,7 +387,7 @@ cl_mem_unpin(cl_mem mem) assert(mem); if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0)) return CL_INVALID_MEM; - drm_intel_bo_unpin(mem->bo); + cl_buffer_unpin(mem->bo); return CL_SUCCESS; } diff --git a/src/cl_mem.h b/src/cl_mem.h index 1177a76..00dcd93 100644 --- a/src/cl_mem.h +++ b/src/cl_mem.h @@ -21,11 +21,9 @@ #define __CL_MEM_H__ #include "cl_internals.h" +#include "cl_driver.h" #include "CL/cl.h" -/* Store the object in video memory */ -struct _drm_intel_bo; - typedef enum cl_image_tiling { CL_NO_TILE = 0, CL_TILE_X = 1, @@ -36,7 +34,7 @@ typedef enum cl_image_tiling { struct _cl_mem { uint64_t magic; /* To identify it as a memory object */ volatile int ref_n; /* This object is reference counted */ - struct _drm_intel_bo *bo; /* Data in GPU memory */ + cl_buffer *bo; /* Data in GPU memory */ cl_mem prev, next; /* We chain the memory buffers together */ cl_context ctx; /* Context it belongs to */ cl_mem_flags flags; /* Flags specified at the creation time */ @@ -67,10 +65,10 @@ extern void cl_mem_delete(cl_mem); /* Add one more reference to this object */ extern void cl_mem_add_ref(cl_mem); -/* Directly map a memory object (just use drm_intel_bo_map) */ +/* Directly map a memory object */ extern void *cl_mem_map(cl_mem); -/* Unmap a memory object (just use drm_intel_bo_unmap) */ +/* Unmap a memory object */ extern cl_int cl_mem_unmap(cl_mem); /* Pin/unpin the buffer in memory (you must be root) */ diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index f151d92..def75f1 100644 --- a/src/intel/intel_driver.c 
+++ b/src/intel/intel_driver.c @@ -18,6 +18,7 @@ */ #include "intel_driver.h" +#include "intel_gpgpu.h" #include "intel_batchbuffer.h" #include "x11/dricommon.h" @@ -32,6 +33,7 @@ #include "cl_utils.h" #include "cl_alloc.h" +#include "cl_driver.h" #include "cl_genx_driver.h" #define SET_BLOCKED_SIGSET(DRIVER) do { \ @@ -59,7 +61,6 @@ RESTORE_BLOCKED_SIGSET(DRIVER); \ } while (0) - LOCAL intel_driver_t* intel_driver_new(void) { @@ -340,18 +341,27 @@ intel_driver_get_ver(struct intel_driver *drv) return drv->gen_ver; } +LOCAL uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; } +LOCAL void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; } + LOCAL void intel_setup_callbacks(void) { + cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new; + cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete; + cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver; + cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr; cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc; + cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference; cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference; cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map; cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap; + cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual; cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin; cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin; cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata; cl_buffer_emit_reloc = (cl_buffer_emit_reloc_cb *) drm_intel_bo_emit_reloc; - cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr; - cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver; + cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering; + intel_set_gpgpu_callbacks(); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c 
index 7f01f61..a82ba6e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -58,7 +58,7 @@ struct intel_gpgpu { intel_driver_t *drv; intel_batchbuffer_t *batch; - genx_gpgpu_kernel_t *ker; + cl_gpgpu_kernel_t *ker; struct { dri_bo *bo; @@ -78,26 +78,9 @@ struct intel_gpgpu uint32_t max_threads; /* max threads requested by the user */ }; -LOCAL intel_gpgpu_t* -intel_gpgpu_new(intel_driver_t *drv) -{ - intel_gpgpu_t *state = NULL; +typedef struct intel_gpgpu intel_gpgpu_t; - TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t)); - state->drv = drv; - state->batch = intel_batchbuffer_new(state->drv); - assert(state->batch); - intel_batchbuffer_init(state->batch, state->drv); - -exit: - return state; -error: - intel_gpgpu_delete(state); - state = NULL; - goto exit; -} - -LOCAL void +static void intel_gpgpu_delete(intel_gpgpu_t *state) { if (state == NULL) @@ -118,8 +101,27 @@ intel_gpgpu_delete(intel_gpgpu_t *state) cl_free(state); } +static intel_gpgpu_t* +intel_gpgpu_new(intel_driver_t *drv) +{ + intel_gpgpu_t *state = NULL; + + TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t)); + state->drv = drv; + state->batch = intel_batchbuffer_new(state->drv); + assert(state->batch); + intel_batchbuffer_init(state->batch, state->drv); + +exit: + return state; +error: + intel_gpgpu_delete(state); + state = NULL; + goto exit; +} + static void -gpgpu_select_pipeline(intel_gpgpu_t *state) +intel_gpgpu_select_pipeline(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 1); OUT_BATCH(state->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); @@ -127,7 +129,7 @@ gpgpu_select_pipeline(intel_gpgpu_t *state) } static void -gpgpu_set_base_address(intel_gpgpu_t *state) +intel_gpgpu_set_base_address(intel_gpgpu_t *state) { const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */ BEGIN_BATCH(state->batch, 10); @@ -162,7 +164,7 @@ gpgpu_set_base_address(intel_gpgpu_t *state) } static void -gpgpu_load_vfe_state(intel_gpgpu_t *state) +intel_gpgpu_load_vfe_state(intel_gpgpu_t 
*state) { BEGIN_BATCH(state->batch, 8); OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2)); @@ -171,23 +173,13 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) intel_batchbuffer_alloc_space(state->batch,0); memset(vfe, 0, sizeof(struct gen6_vfe_state_inline)); - vfe->vfe1.gpgpu_mode = - (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 1 : 0; + vfe->vfe1.gpgpu_mode = 1; vfe->vfe1.bypass_gateway_ctl = 1; vfe->vfe1.reset_gateway_timer = 1; vfe->vfe1.max_threads = state->max_threads - 1; vfe->vfe1.urb_entries = 64; - if (state->drv->gen_ver >= 7) { - vfe->vfe3.curbe_size = 480; - vfe->vfe4.scoreboard_mask = 0; - } else { - vfe->vfe3.curbe_size = 63; - vfe->vfe3.urb_size = 13; - vfe->vfe4.scoreboard_mask = 0x80000000; - } - - //M - //p + vfe->vfe3.curbe_size = 480; + vfe->vfe4.scoreboard_mask = 0; //vfe->vfe3.urb_size = 13; //vfe->vfe4.scoreboard_mask = (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000; intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t)); @@ -195,7 +187,7 @@ gpgpu_load_vfe_state(intel_gpgpu_t *state) } static void -gpgpu_load_constant_buffer(intel_gpgpu_t *state) +intel_gpgpu_load_constant_buffer(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ @@ -213,7 +205,7 @@ gpgpu_load_constant_buffer(intel_gpgpu_t *state) } static void -gpgpu_load_idrt(intel_gpgpu_t *state) +intel_gpgpu_load_idrt(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ @@ -311,7 +303,7 @@ enum GFX3DSTATE_PIPELINED_SUBOPCODE }; static void -gpgpu_pipe_control(intel_gpgpu_t *state) +intel_gpgpu_pipe_control(intel_gpgpu_t *state) { BEGIN_BATCH(state->batch, SIZEOF32(gen6_pipe_control_t)); gen6_pipe_control_t* pc = (gen6_pipe_control_t*) @@ -328,7 +320,7 @@ gpgpu_pipe_control(intel_gpgpu_t *state) ADVANCE_BATCH(state->batch); } -LOCAL void +static void intel_gpgpu_set_L3(intel_gpgpu_t *state, 
uint32_t use_barrier) { BEGIN_BATCH(state->batch, 6); @@ -347,21 +339,21 @@ intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier) OUT_BATCH(state->batch, gpgpu_l3_config_reg2[4]); ADVANCE_BATCH(state->batch); - gpgpu_pipe_control(state); + intel_gpgpu_pipe_control(state); } -LOCAL void -gpgpu_batch_start(intel_gpgpu_t *state) +static void +intel_gpgpu_batch_start(intel_gpgpu_t *state) { intel_batchbuffer_start_atomic(state->batch, 256); - gpgpu_pipe_control(state); + intel_gpgpu_pipe_control(state); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) intel_gpgpu_set_L3(state, state->ker->use_barrier); - gpgpu_select_pipeline(state); - gpgpu_set_base_address(state); - gpgpu_load_vfe_state(state); - gpgpu_load_constant_buffer(state); - gpgpu_load_idrt(state); + intel_gpgpu_select_pipeline(state); + intel_gpgpu_set_base_address(state); + intel_gpgpu_load_vfe_state(state); + intel_gpgpu_load_constant_buffer(state); + intel_gpgpu_load_idrt(state); if (state->perf_b.bo) { BEGIN_BATCH(state->batch, 3); @@ -378,8 +370,8 @@ gpgpu_batch_start(intel_gpgpu_t *state) } } -LOCAL void -gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) +static void +intel_gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) { /* Insert the performance counter command */ if (state->perf_b.bo) { @@ -396,26 +388,26 @@ gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode) ADVANCE_BATCH(state->batch); } - if(flush_mode) gpgpu_pipe_control(state); + if(flush_mode) intel_gpgpu_pipe_control(state); intel_batchbuffer_end_atomic(state->batch); } -LOCAL void -gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz) +static void +intel_gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz) { intel_batchbuffer_reset(state->batch, sz); } -LOCAL void -gpgpu_flush(intel_gpgpu_t *state) +static void +intel_gpgpu_flush(intel_gpgpu_t *state) { intel_batchbuffer_flush(state->batch); } -LOCAL void -gpgpu_state_init(intel_gpgpu_t *state, - uint32_t max_threads, - uint32_t size_cs_entry) +static 
void +intel_gpgpu_state_init(intel_gpgpu_t *state, + uint32_t max_threads, + uint32_t size_cs_entry) { dri_bo *bo; @@ -477,7 +469,7 @@ gpgpu_state_init(intel_gpgpu_t *state, } static void -gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) +intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; heap->binding_table[index] = offsetof(surface_heap_t, surface) + @@ -492,11 +484,11 @@ gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo) } static void -gpgpu_bind_buf_gen7(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t size, - uint32_t cchint) +intel_gpgpu_bind_buf_gen7(intel_gpgpu_t *state, + int32_t index, + dri_bo* obj_bo, + uint32_t size, + uint32_t cchint) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; @@ -509,18 +501,18 @@ gpgpu_bind_buf_gen7(intel_gpgpu_t *state, ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */ ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */ ss->ss5.cache_control = cc_llc_l3; - gpgpu_set_buf_reloc_gen7(state, index, obj_bo); + intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo); } static void -gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int32_t pitch, - int32_t tiling) +intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, + int32_t index, + dri_bo* obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int32_t pitch, + int32_t tiling) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; @@ -539,48 +531,47 @@ gpgpu_bind_image2D_gen7(intel_gpgpu_t *state, ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_YMAJOR; } - gpgpu_set_buf_reloc_gen7(state, index, obj_bo); + 
intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo); } -LOCAL void -gpgpu_bind_buf(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t cchint) +static void +intel_gpgpu_bind_buf(intel_gpgpu_t *state, + int32_t index, + drm_intel_bo *obj_bo, + uint32_t cchint) { - uint32_t size = obj_bo->size; - + const uint32_t size = obj_bo->size; assert(index < MAX_SURFACES); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) - gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint); + intel_gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint); else NOT_IMPLEMENTED; } -LOCAL void -gpgpu_bind_image2D(intel_gpgpu_t *state, - int32_t index, - dri_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int32_t pitch, - gpgpu_tiling_t tiling) +static void +intel_gpgpu_bind_image2D(intel_gpgpu_t *state, + int32_t index, + cl_buffer *obj_bo, + uint32_t format, + int32_t w, + int32_t h, + int32_t pitch, + cl_gpgpu_tiling_t tiling) { assert(index < MAX_SURFACES); if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) - gpgpu_bind_image2D_gen7(state, index, obj_bo, format, w, h, pitch, tiling); + intel_gpgpu_bind_image2D_gen7(state, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling); else NOT_IMPLEMENTED; } static void -gpgpu_build_idrt(intel_gpgpu_t *state, - genx_gpgpu_kernel_t *kernel, - uint32_t ker_n) +intel_gpgpu_build_idrt(intel_gpgpu_t *state, + cl_gpgpu_kernel_t *kernel, + uint32_t ker_n) { gen6_interface_descriptor_t *desc; - dri_bo *bo; + drm_intel_bo *bo = NULL, *ker_bo = NULL; uint32_t i; bo = state->idrt_b.bo; @@ -590,7 +581,8 @@ gpgpu_build_idrt(intel_gpgpu_t *state, for (i = 0; i < ker_n; i++) { memset(desc, 0, sizeof(*desc)); - desc->desc0.kernel_start_pointer = kernel[i].bo->offset >> 6; /* reloc */ + ker_bo = (drm_intel_bo *) kernel[i].bo; + desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ desc->desc2.sampler_state_pointer = state->sampler_state_b.bo->offset >> 5; desc->desc3.binding_table_entry_count = 0; /* 
no prefetch */ desc->desc3.binding_table_pointer = 0; @@ -624,7 +616,7 @@ gpgpu_build_idrt(intel_gpgpu_t *state, I915_GEM_DOMAIN_INSTRUCTION, 0, 0, i * sizeof(*desc) + offsetof(gen6_interface_descriptor_t, desc0), - kernel[i].bo); + ker_bo); dri_bo_emit_reloc(bo, I915_GEM_DOMAIN_INSTRUCTION, 0, @@ -637,8 +629,8 @@ gpgpu_build_idrt(intel_gpgpu_t *state, dri_bo_unmap(bo); } -LOCAL void -gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) +static void +intel_gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) { unsigned char *constant_buffer = NULL; @@ -649,8 +641,8 @@ gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) dri_bo_unmap(state->curbe_b.bo); } -LOCAL void -gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) +static void +intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) { if (n) { /*sizeof(gen6_sampler_state_t) == sizeof(gen7_surface_state_t) */ @@ -659,17 +651,17 @@ gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) } } -LOCAL void -gpgpu_states_setup(intel_gpgpu_t *state, genx_gpgpu_kernel_t *kernel, uint32_t ker_n) +static void +intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel_t *kernel, uint32_t ker_n) { state->ker = kernel; - gpgpu_build_idrt(state, kernel, ker_n); + intel_gpgpu_build_idrt(state, kernel, ker_n); dri_bo_unmap(state->surface_heap_b.bo); dri_bo_unmap(state->sampler_state_b.bo); } -LOCAL void -gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n) +static void +intel_gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n) { BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0); @@ -679,52 +671,22 @@ gpgpu_update_barrier(intel_gpgpu_t *state, uint32_t barrierID, uint32_t thread_n ADVANCE_BATCH(state->batch); } -LOCAL void -gpgpu_set_perf_counters(intel_gpgpu_t *state, dri_bo *perf) +static void 
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *state, cl_buffer *perf) { if (state->perf_b.bo) drm_intel_bo_unreference(state->perf_b.bo); - drm_intel_bo_reference(perf); - state->perf_b.bo = perf; -} - -LOCAL void -gpgpu_run(intel_gpgpu_t *state, int32_t ki) -{ - BEGIN_BATCH(state->batch, 6); - OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | 5); - OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */ - OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */ - OUT_BATCH(state->batch, 0); /* no indirect data ptr */ - OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - ADVANCE_BATCH(state->batch); -} - -LOCAL char* -gpgpu_run_with_inline(intel_gpgpu_t *state, int32_t ki, size_t sz) -{ - const uint32_t len = (uint32_t) (sz >> 2); - - assert(sz % sizeof(int32_t) == 0); - BEGIN_BATCH(state->batch, len + 6); - OUT_BATCH(state->batch, GEN_CMD_MEDIA_OBJECT | (len + 4)); - OUT_BATCH(state->batch, ki); /* kernel index + DBGOBJ=0 << 8 */ - OUT_BATCH(state->batch, 0); /* no children, no thread sync, no scoreboard, any half-slice, indirect data size = 0 */ - OUT_BATCH(state->batch, 0); /* no indirect data ptr */ - OUT_BATCH(state->batch, 0); /* scoreboard Y,X = 0 or just 0 in reg.0 */ - OUT_BATCH(state->batch, 0); /* scoreboard color, mask = 0 or just 0 in reg.1 */ - return (char*) intel_batchbuffer_alloc_space(state->batch,sz); + drm_intel_bo_reference((drm_intel_bo*) perf); + state->perf_b.bo = (drm_intel_bo*) perf; } -LOCAL void -gpgpu_walker(intel_gpgpu_t *state, - uint32_t simd_sz, - uint32_t thread_n, - const size_t global_wk_off[3], - const size_t global_wk_sz[3], - const size_t local_wk_sz[3]) +static void +intel_gpgpu_walker(intel_gpgpu_t *state, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], 
+ const size_t global_wk_sz[3], + const size_t local_wk_sz[3]) { const uint32_t global_wk_dim[3] = { global_wk_sz[0] / local_wk_sz[0], @@ -755,17 +717,23 @@ gpgpu_walker(intel_gpgpu_t *state, ADVANCE_BATCH(state->batch); } -LOCAL int32_t -intel_gpgpu_version(intel_gpgpu_t *gpgpu) -{ - return gpgpu->drv->gen_ver; +LOCAL void +intel_set_gpgpu_callbacks(void) +{ + cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new; + cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete; + cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) intel_gpgpu_bind_image2D; + cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf; + cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init; + cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters; + cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants; + cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup; + cl_gpgpu_update_barrier = (cl_gpgpu_update_barrier_cb *) intel_gpgpu_update_barrier; + cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers; + cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset; + cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start; + cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end; + cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush; + cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker; } -uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) -{ - return (bo->size); -} -void *drm_intel_bo_get_virtual(drm_intel_bo *bo) -{ - return (bo->virtual); -} diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h index e3b8dfd..006a55b 100644 --- a/src/intel/intel_gpgpu.h +++ b/src/intel/intel_gpgpu.h @@ -21,126 +21,13 @@ #define __GENX_GPGPU_H__ #include "cl_utils.h" -#include "intel_bufmgr.h" +#include "cl_driver.h" #include #include -#define MAX_SURFACES 128 -#define MAX_SAMPLERS 
16 - -enum gen6_cache_control { - cc_gtt = 0x0, - cc_l3 = 0x1, - cc_llc = 0x2, - cc_llc_l3 = 0x3 -}; - -/* Use this structure to bind kernels in the gpgpu state */ -typedef struct genx_gpgpu_kernel { - const char *name; /* kernel name and bo name */ - uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */ - uint32_t cst_sz; /* total size of all constants */ - struct _drm_intel_bo *bo;/* kernel code in the proper addr space */ - int32_t barrierID; /* barrierID for _this_ kernel */ - uint32_t use_barrier:1; /* For gen7 (automatic barrier management) */ - uint32_t thread_n:15; /* For gen7 (automatic barrier management) */ - uint32_t slm_sz:16; /* For gen7 (automatic SLM allocation) */ -} genx_gpgpu_kernel_t; - -/* Convenient abstraction of the device */ -struct intel_driver; - -/* Covenient way to talk to the device */ -typedef struct intel_gpgpu intel_gpgpu_t; - -/* Buffer object as exposed by drm_intel */ -struct _drm_intel_bo; - -/* Allocate and initialize a GPGPU state */ -extern intel_gpgpu_t* intel_gpgpu_new(struct intel_driver*); - -/* Destroy and deallocate a GPGPU state */ -extern void intel_gpgpu_delete(intel_gpgpu_t*); - -/* Get the device generation */ -extern int32_t intel_gpgpu_version(intel_gpgpu_t*); - -/* Buffer management wrapper APIs */ -extern uint32_t drm_intel_bo_get_size(drm_intel_bo *bo); -extern void *drm_intel_bo_get_virtual(drm_intel_bo *bo); - -/* Set typeless buffer descriptor in the current binding table */ -extern void gpgpu_bind_buf(intel_gpgpu_t*, - int32_t index, - struct _drm_intel_bo* obj_bo, - uint32_t cchint); - -typedef enum gpgpu_tiling { - GPGPU_NO_TILE = 0, - GPGPU_TILE_X = 1, - GPGPU_TILE_Y = 2, -} gpgpu_tiling_t; - -/* Set a 2d texture */ -extern void gpgpu_bind_image2D(intel_gpgpu_t *state, - int32_t index, - struct _drm_intel_bo* obj_bo, - uint32_t format, - int32_t w, - int32_t h, - int pitch, - gpgpu_tiling_t tiling); - -/* Configure state, size in 512-bit units */ -extern void 
gpgpu_state_init(intel_gpgpu_t*, uint32_t max_threads, uint32_t size_cs_entry); - -/* Set the buffer object where to report performance counters */ -extern void gpgpu_set_perf_counters(intel_gpgpu_t*, struct _drm_intel_bo *perf); - -/* Fills current constant buffer with data */ -extern void gpgpu_upload_constants(intel_gpgpu_t*, const void* data, uint32_t size); - -/* Setup all indirect states */ -extern void gpgpu_states_setup(intel_gpgpu_t*, genx_gpgpu_kernel_t* kernel, uint32_t ker_n); - -/* Make HW threads use barrierID */ -extern void gpgpu_update_barrier(intel_gpgpu_t*, uint32_t barrierID, uint32_t thread_n); - -/* Upload the constant samplers as specified inside the OCL kernel */ -extern void gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n); - -/* Set a sampler */ -extern void gpgpu_set_sampler(intel_gpgpu_t*, uint32_t index, uint32_t non_normalized); - -/* Allocate the batch buffer and return the BO used for the batch buffer */ -extern void gpgpu_batch_reset(intel_gpgpu_t*, size_t sz); - -/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */ -extern void gpgpu_batch_start(intel_gpgpu_t*); - -/* atomic end with possibly inserted flush */ -extern void gpgpu_batch_end(intel_gpgpu_t*, int32_t flush_mode); - -/* Emit MI_FLUSH */ -extern void gpgpu_flush(intel_gpgpu_t*); - -/* Enqueue a MEDIA object with no inline data */ -extern void gpgpu_run(intel_gpgpu_t*, int32_t ki); - -/* Enqueue a MEDIA object with inline data to push afterward. Returns the - * pointer where to push. 
sz is the size of the data we are going to pass - */ -extern char* gpgpu_run_with_inline(intel_gpgpu_t*, int32_t ki, size_t sz); - -/* Will spawn all threads */ -extern void -gpgpu_walker(intel_gpgpu_t *state, - uint32_t simd_sz, - uint32_t thread_n, - const size_t global_wk_off[3], - const size_t global_wk_sz[3], - const size_t local_wk_sz[3]); +/* Set the gpgpu related call backs */ +extern void intel_set_gpgpu_callbacks(void); #endif /* __GENX_GPGPU_H__ */ diff --git a/src/sim/sim_buffer.c b/src/sim/sim_driver.c similarity index 94% rename from src/sim/sim_buffer.c rename to src/sim/sim_driver.c index 820c770..5e5cfad 100644 --- a/src/sim/sim_buffer.c +++ b/src/sim/sim_driver.c @@ -19,13 +19,13 @@ #include "cl_utils.h" #include "cl_alloc.h" -#include "sim/sim_buffer.h" +#include "sim/sim_driver.h" #include "CL/cl.h" #include #include #include -#include "cl_buffer.h" +#include "cl_driver.h" /* Just to count allocations */ typedef struct sim_bufmgr { volatile int buf_n; } sim_bufmgr_t; @@ -81,13 +81,6 @@ sim_buffer_unreference(sim_buffer_t *buf) sim_buffer_delete(buf); } -static void* -sim_buffer_map(sim_buffer_t *buf) -{ - assert(buf); - return buf->data; -} - static int sim_buffer_subdata(sim_buffer_t *buf, unsigned long offset, unsigned long size, const void *data) { @@ -107,8 +100,10 @@ sim_buffer_emit_reloc(sim_buffer_t *buf, { return 1; } + +static int sim_buffer_map(sim_buffer_t *buf, uint32_t write_enable) {return 0;} static int sim_buffer_unmap(sim_buffer_t *buf) {return 0;} -static int sim_buffer_pin(sim_buffer_t *buf) {return 0;} +static int sim_buffer_pin(sim_buffer_t *buf, uint32_t alignment) {return 0;} static int sim_buffer_unpin(sim_buffer_t *buf) {return 0;} LOCAL void diff --git a/src/sim/sim_buffer.h b/src/sim/sim_driver.h similarity index 91% rename from src/sim/sim_buffer.h rename to src/sim/sim_driver.h index 230d89c..f80f234 100644 --- a/src/sim/sim_buffer.h +++ b/src/sim/sim_driver.h @@ -17,11 +17,12 @@ * Author: Benjamin Segovia */ 
-#ifndef __SIM_BUFFER_H__ -#define __SIM_BUFFER_H__ +#ifndef __SIM_DRIVER_H__ +#define __SIM_DRIVER_H__ /* Setup the call backs when using the simulator */ extern void sim_setup_callbacks(void); -#endif /* __SIM_BUFFER_H__ */ +#endif /* __SIM_DRIVER_H__ */ + -- 2.7.4