From f98a8cfb7eec88369ca7059b4a338d33d9bdd003 Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <segovia.benjamin@gmail.com>
Date: Sun, 8 Apr 2012 22:51:40 -0700
Subject: [PATCH] Made most of the very basic pieces for the simulation parts

---
 kernels/compile.sh              |   6 ++
 kernels/stdlib.h                | 114 +++++++++++++++++++++++++++++++++++++
 kernels/test_write_only_2.cl    |   8 +++
 kernels/test_write_only_2.cl.ll |  28 +++++++++
 src/CMakeLists.txt              |   5 ++
 src/cl_api.c                    |  10 ++--
 src/cl_command_queue.c          |  28 ++++-----
 src/cl_command_queue_gen7.c     |   2 +-
 src/cl_kernel.c                 |  62 +++++++++++++++++++-
 src/cl_kernel.h                 |  32 +++++++----
 src/sim/sim_driver.c            | 123 +++++++++++++++++++++++++---------------
 src/sim/sim_simulator.c         |  69 ++++++++++++++++++++++
 src/sim/sim_simulator.h         |  31 ++++++++++
 13 files changed, 434 insertions(+), 84 deletions(-)
 create mode 100755 kernels/compile.sh
 create mode 100644 kernels/stdlib.h
 create mode 100644 kernels/test_write_only_2.cl
 create mode 100644 kernels/test_write_only_2.cl.ll
 create mode 100644 src/sim/sim_simulator.c
 create mode 100644 src/sim/sim_simulator.h

diff --git a/kernels/compile.sh b/kernels/compile.sh
new file mode 100755
index 0000000..e1177a7
--- /dev/null
+++ b/kernels/compile.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1 -o $1.o
+llvm-dis $1.o
+rm $1.o
+mv $1.o.ll $1.ll
+
diff --git a/kernels/stdlib.h b/kernels/stdlib.h
new file mode 100644
index 0000000..472655a
--- /dev/null
+++ b/kernels/stdlib.h
@@ -0,0 +1,114 @@
+/* 
+ * Copyright Â© 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME)                             \
+__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void);  \
+__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void);  \
+__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);
+DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+#undef DECL_INTERNAL_WORK_ITEM_FN
+
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME)              \
+inline unsigned NAME(unsigned int dim) {            \
+  if (dim == 0) return __gen_ocl_##NAME##0();       \
+  else if (dim == 1) return __gen_ocl_##NAME##1();  \
+  else if (dim == 2) return __gen_ocl_##NAME##2();  \
+  else return 0;                                    \
+}
+DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
+DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
+#undef DECL_PUBLIC_WORK_ITEM_FN
+
+inline unsigned int get_global_id(unsigned int dim) {
+  return get_local_id(dim) + get_local_size(dim) * get_num_groups(dim);
+}
+
+__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
+__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {
+  return cond ? src0 : src1;
+}
+__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {
+  return cond ? src0 : src1;
+}
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int uint2 __attribute__((ext_vector_type(2)));
+typedef unsigned uint3 __attribute__((ext_vector_type(3)));
+typedef unsigned uint4 __attribute__((ext_vector_type(4)));
+typedef bool bool2 __attribute__((ext_vector_type(2)));
+typedef bool bool3 __attribute__((ext_vector_type(3)));
+typedef bool bool4 __attribute__((ext_vector_type(4)));
+
+// This will be optimized out by LLVM and will output LLVM select instructions
+#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK)                       \
+__attribute__((overloadable))                                             \
+inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) {            \
+  TYPE4 dst;                                                              \
+  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */          \
+  const TYPE x1 = src1.x;                                                 \
+  const TYPE y0 = src0.y;                                                 \
+  const TYPE y1 = src1.y;                                                 \
+  const TYPE z0 = src0.z;                                                 \
+  const TYPE z1 = src1.z;                                                 \
+  const TYPE w0 = src0.w;                                                 \
+  const TYPE w1 = src1.w;                                                 \
+                                                                          \
+  dst.x = (cond.x & MASK) ? x1 : x0;                                      \
+  dst.y = (cond.y & MASK) ? y1 : y0;                                      \
+  dst.z = (cond.z & MASK) ? z1 : z0;                                      \
+  dst.w = (cond.w & MASK) ? w1 : w0;                                      \
+  return dst;                                                             \
+}
+DECL_SELECT4(int4, int, int4, 0x80000000)
+DECL_SELECT4(float4, float, int4, 0x80000000)
+#undef DECL_SELECT4
+
+__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) {
+  return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
+}
+__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) {
+  return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
+}
+__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) {
+  return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
+                  mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
+}
+
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+//#define __local __attribute__((address_space(3)))
+#define global __global
+//#define local __local
+#define constant __constant
+#define private __private
+
+#define NULL ((void*)0)
diff --git a/kernels/test_write_only_2.cl b/kernels/test_write_only_2.cl
new file mode 100644
index 0000000..8edd39a
--- /dev/null
+++ b/kernels/test_write_only_2.cl
@@ -0,0 +1,8 @@
+#include "stdlib.h"
+__kernel void
+test_write_only(__global float* dst )
+{
+    int id = (int)get_global_id(0);
+    dst[id] = 1;
+}
+
diff --git a/kernels/test_write_only_2.cl.ll b/kernels/test_write_only_2.cl.ll
new file mode 100644
index 0000000..3d7e493
--- /dev/null
+++ b/kernels/test_write_only_2.cl.ll
@@ -0,0 +1,28 @@
+; ModuleID = 'test_write_only_2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @test_write_only(float addrspace(1)* nocapture %dst) nounwind noinline {
+get_global_id.exit:
+  %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+  %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+  %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+  %mul.i = mul i32 %call.i10.i, %call.i3.i
+  %add.i = add i32 %mul.i, %call.i.i
+  %arrayidx = getelementptr inbounds float addrspace(1)* %dst, i32 %add.i
+  store float 1.000000e+00, float addrspace(1)* %arrayidx, align 4, !tbaa !1
+  ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (float addrspace(1)*)* @test_write_only}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9e44bc7..035e941 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,10 +17,15 @@ SET(OPENCL_SRC
     cl_device_id.c
     cl_context.c
     cl_command_queue.c
+    cl_command_queue.h
     cl_command_queue_gen7.c
     cl_driver.c
+    cl_driver.h
     cl_driver.cpp
     sim/sim_driver.c
+    sim/sim_driver.h
+    sim/sim_simulator.c
+    sim/sim_simulator.h
     intel/intel_gpgpu.c
     intel/intel_batchbuffer.c
     intel/intel_driver.c
diff --git a/src/cl_api.c b/src/cl_api.c
index 2271be9..aa7b664 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -621,11 +621,11 @@ clGetKernelWorkGroupInfo(cl_kernel                   kernel,
                          void *                      param_value,
                          size_t *                    param_value_size_ret)
 {
-  return cl_get_kernel_workgroup_info(device, 
-					param_name, 
-					param_value_size, 
-					param_value, 
-					param_value_size_ret);
+  return cl_get_kernel_workgroup_info(device,
+                                      param_name,
+                                      param_value_size,
+                                      param_value,
+                                      param_value_size_ret);
 }
 
 cl_int
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 79f67a9..06eada9 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -105,26 +105,18 @@ cl_command_queue_bind_surface(cl_command_queue queue,
                               cl_buffer *scratch,
                               uint32_t local_sz)
 {
-#if 0
-  cl_context ctx = queue->ctx;
-  cl_gpgpu gpgpu = queue->gpgpu;
-  cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
-  cl_buffer sync_bo = NULL;
-  cl_int err = CL_SUCCESS;
-
-  /* Now bind a bo used for synchronization */
-  sync_bo = cl_buffer_alloc(bufmgr, "sync surface", 64, 64);
-  // cl_gpgpu_bind_buf(gpgpu, GEN_MAX_SURFACES-1, sync_bo, cc_llc_l3);
-  if (queue->last_batch != NULL)
-    cl_buffer_unreference(queue->last_batch);
-  queue->last_batch = sync_bo;
+  /* Bind all user buffers (given by clSetKernelArg) */
+  uint32_t i;
+  for (i = 0; i < k->arg_n; ++k) {
+    uint32_t offset; // location of the address in the curbe
+    if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR &&
+        gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_CONSTANT_PTR)
+      continue;
+    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+    cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
+  }
 
-// error:
-  assert(err == CL_SUCCESS); /* Cannot fail here */
-  return err;
-#else
   return CL_SUCCESS;
-#endif
 }
 
 #if USE_FULSIM
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 70a1827..d35bf7d 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -87,7 +87,7 @@ cl_curbe_fill(cl_kernel ker,
               const size_t *global_wk_sz,
               const size_t *local_wk_sz)
 {
-  uint32_t offset;
+  int32_t offset;
 #define UPLOAD(ENUM, VALUE)                                              \
   if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
     *((uint32_t *) (curbe + offset)) = VALUE;
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 5b9f8bc..36064a4 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -35,6 +35,7 @@
 LOCAL void
 cl_kernel_delete(cl_kernel k)
 {
+  uint32_t i;
   if (k == NULL) return;
 
   /* We are not done with the kernel */
@@ -46,6 +47,13 @@ cl_kernel_delete(cl_kernel k)
   if (k->ref_its_program) cl_program_delete(k->program);
   /* Release the curbe if allocated */
   if (k->curbe) cl_free(k->curbe);
+  /* Release the argument array if required */
+  if (k->args) {
+    for (i = 0; i < k->arg_n; ++i)
+      if (k->args[i].mem != NULL)
+        cl_mem_delete(k->args[i].mem);
+    cl_free(k->args);
+  }
   k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(k);
 }
@@ -83,9 +91,56 @@ cl_kernel_add_ref(cl_kernel k)
 LOCAL cl_int
 cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 {
-  cl_int err = CL_SUCCESS;
+  uint32_t offset;            /* where to patch */
+  enum gbe_arg_type arg_type; /* kind of argument */
+  size_t arg_sz;              /* size of the argument */
+  cl_mem mem;                 /* for __global, __constant and image arguments */
+
+  if (UNLIKELY(index >= k->arg_n))
+    return CL_INVALID_ARG_INDEX;
+  arg_type = gbe_kernel_get_arg_type(k->opaque, index);
+  arg_sz = gbe_kernel_get_arg_size(k->opaque, index);
+  if (UNLIKELY(arg_sz != sz))
+    return CL_INVALID_ARG_SIZE;
+
+  /* Copy the structure or the value directly into the curbe */
+  if (arg_type == GBE_ARG_VALUE) {
+    if (UNLIKELY(value == NULL))
+      return CL_INVALID_KERNEL_ARGS;
+    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    assert(offset + sz <= k->curbe_sz);
+    memcpy(k->curbe + offset, value, sz);
+    k->args[index].local_sz = 0;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    return CL_SUCCESS;
+  }
 
-  return err;
+  /* For a local pointer just save the size */
+  if (arg_type == GBE_ARG_LOCAL_PTR) {
+    if (UNLIKELY(value != NULL))
+      return CL_INVALID_KERNEL_ARGS;
+    k->args[index].local_sz = sz;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    return CL_SUCCESS;
+  }
+
+  /* Otherwise, we just need to check that this is a buffer */
+  if (UNLIKELY(value == NULL))
+    return CL_INVALID_KERNEL_ARGS;
+  mem = *(cl_mem*) value;
+  if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+    return CL_INVALID_ARG_VALUE;
+  if (mem->is_image)
+    if (UNLIKELY(arg_type == GBE_ARG_IMAGE))
+      return CL_INVALID_ARG_VALUE;
+  cl_mem_add_ref(mem);
+  k->args[index].mem = mem;
+  k->args[index].is_set = 1;
+  k->args[index].local_sz = 0;
+
+  return CL_SUCCESS;
 }
 
 LOCAL uint32_t
@@ -105,6 +160,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   const uint32_t code_sz = gbe_kernel_get_code_size(opaque);
   const char *code = gbe_kernel_get_code(opaque);
   k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
+  k->arg_n = gbe_kernel_get_arg_num(opaque);
 
   /* Upload the code */
   cl_buffer_subdata(k->bo, 0, code_sz, code);
@@ -137,6 +193,8 @@ cl_kernel_dup(cl_kernel from)
   to->ref_n = 1;
   to->magic = CL_MAGIC_KERNEL_HEADER;
   to->program = from->program;
+  to->arg_n = from->arg_n;
+  TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
 
   /* Retain the bos */
   if (from->bo)       cl_buffer_reference(from->bo);
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 545c268..1906cec 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -32,18 +32,28 @@
 /* This is the kernel as it is interfaced by the compiler */
 struct _gbe_kernel;
 
-/*! One OCL function */
+/* We need to save buffer data for relocation and binding and we must figure out
+ * if all arguments are properly set
+ */
+typedef struct cl_argument {
+  cl_mem mem;           /* For image and regular buffers */
+  uint32_t local_sz:31; /* For __local size specification */
+  uint32_t is_set:1;    /* All args must be set before NDRange */
+} cl_argument;
+
+/* One OCL function */
 struct _cl_kernel {
-  uint64_t magic;           /* To identify it as a kernel */
-  volatile int ref_n;       /* We reference count this object */
-  cl_buffer bo;             /* The code itself */
-  cl_buffer const_bo;       /* Buffer for all __constants values in the OCL program */
-  cl_program program;       /* Owns this structure (and pointers) */
-  gbe_kernel opaque;        /* (Opaque) compiler structure for the OCL kernel */
-  char *curbe;              /* One curbe per kernel */
-  size_t curbe_sz;          /* Size of it */
-  
-  uint8_t ref_its_program;  /* True only for the user kernel (created by clCreateKernel) */
+  uint64_t magic;             /* To identify it as a kernel */
+  volatile int ref_n;         /* We reference count this object */
+  cl_buffer bo;               /* The code itself */
+  cl_buffer const_bo;         /* Buffer for all __constants values in the OCL program */
+  cl_program program;         /* Owns this structure (and pointers) */
+  gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
+  char *curbe;                /* One curbe per kernel */
+  size_t curbe_sz;            /* Size of it */
+  cl_argument *args;          /* To track argument setting */
+  uint32_t arg_n:31;          /* Number of arguments */
+  uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c
index aa6c7ac..3fa224c 100644
--- a/src/sim/sim_driver.c
+++ b/src/sim/sim_driver.c
@@ -22,12 +22,16 @@
 #include "cl_device_data.h"
 #include "sim/sim_driver.h"
 #include "CL/cl.h"
+#include "cl_driver.h"
+#include "gen/program.h"
+#include "gen/simulator.h"
+#include "sim/sim_simulator.h"
+
 #include <stdlib.h>
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
 
-#include "cl_driver.h"
 
 /* Fake buffer manager that just counts allocations */
 struct _sim_bufmgr { volatile int buf_n; };
@@ -39,11 +43,7 @@ sim_bufmgr_new(void)
   return cl_calloc(1,sizeof(struct _sim_bufmgr));
 }
 
-static void
-sim_bufmgr_delete(sim_bufmgr bufmgr)
-{
-  cl_free(bufmgr);
-}
+static void sim_bufmgr_delete(sim_bufmgr bufmgr) { cl_free(bufmgr); }
 
 /* Fake low-level driver */
 struct _sim_driver {
@@ -91,7 +91,8 @@ sim_driver_get_bufmgr(sim_driver driver)
 static int
 sim_driver_get_device_id(void)
 {
-  return PCI_CHIP_IVYBRIDGE_GT2; // XXX get some env variable instead
+  /* XXX get some env variable instead */
+  return PCI_CHIP_IVYBRIDGE_GT2;
 }
 
 /* Just a named buffer to mirror real drm functions */
@@ -184,7 +185,7 @@ static int sim_buffer_unpin(sim_buffer buf) {return 0;}
 static int sim_buffer_wait_rendering(sim_buffer buf) {return 0;}
 
 /* Function to call for each HW thread we simulate */
-typedef void (sim_kernel_cb)(void);
+typedef void (sim_kernel_cb)(gbe_simulator, uint32_t, uint32_t, uint32_t, uint32_t);
 
 /* We can bind only a limited number of buffers */
 enum { max_buf_n = 128 };
@@ -192,19 +193,23 @@ enum { max_buf_n = 128 };
 /* Encapsulates operations needed to run one NDrange */
 struct _sim_gpgpu
 {
-  sim_driver driver;     /* the driver the gpgpu states belongs to */
-  sim_kernel_cb *kernel; /* call it for each HW thread */
+  sim_driver driver;                 /* the driver the gpgpu states belongs to */
+  sim_kernel_cb *kernel;             /* call it for each HW thread */
   sim_buffer binded_buf[max_buf_n];  /* all buffers binded for the call */
   char *fake_memory;                 /* fake memory to emulate flat address space in any mode (32 / 64 bits) */
+  char *curbe;                       /* constant buffer */
   uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
+  uint32_t memory_remap[max_buf_n];  /* offset of each buffer in the fake memory space */
   uint32_t max_threads;              /* HW threads running */
   uint32_t cst_sz;                   /* size of the constant buffer */
   uint32_t binded_n;                 /* number of buffers binded */
+  uint32_t thread_n;                 /* number of threads to run per work group */
 };
 typedef struct _sim_gpgpu *sim_gpgpu;
 
-static void sim_gpgpu_delete(sim_gpgpu gpgpu)
-{
+static void sim_gpgpu_delete(sim_gpgpu gpgpu) {
+  if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory);
+  if (gpgpu->curbe) cl_free(gpgpu->curbe);
   cl_free(gpgpu);
 }
 
@@ -212,7 +217,6 @@ static sim_gpgpu sim_gpgpu_new(sim_driver driver)
 {
   sim_gpgpu gpgpu = NULL;
   TRY_ALLOC_NO_ERR(gpgpu, cl_calloc(1, sizeof(struct _sim_gpgpu)));
-
 exit:
   return gpgpu;
 error:
@@ -223,21 +227,40 @@ error:
 
 static void sim_gpgpu_bind_image2D(sim_gpgpu gpgpu,
                                    int32_t index,
-                                   sim_buffer obj_bo,
+                                   sim_buffer bo,
                                    uint32_t format,
                                    int32_t w,
                                    int32_t h,
                                    int pitch,
                                    cl_gpgpu_tiling tiling) {}
 static void sim_gpgpu_set_perf_counters(sim_gpgpu gpgpu, sim_buffer perf) {}
-static void sim_gpgpu_upload_constants(sim_gpgpu gpgpu, const void* data, uint32_t size) {}
 static void sim_gpgpu_upload_samplers(sim_gpgpu gpgpu, const void *data, uint32_t n) {}
 static void sim_gpgpu_batch_reset(sim_gpgpu gpgpu, size_t sz) {}
 static void sim_gpgpu_batch_start(sim_gpgpu gpgpu) {}
 static void sim_gpgpu_batch_end(sim_gpgpu gpgpu, uint32_t flush_mode) {}
 static void sim_gpgpu_flush(sim_gpgpu gpgpu) {}
 
-static void sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry)
+static void
+sim_gpgpu_upload_constants(sim_gpgpu gpgpu, const void* data, uint32_t size)
+{
+  uint32_t i, j;
+  assert(size == gpgpu->cst_sz * gpgpu->thread_n);
+  if (gpgpu->curbe) cl_free(gpgpu->curbe);
+  gpgpu->curbe = (char*) cl_malloc(size);
+
+  /* Upload the buffer offsets per thread */
+  for (i = 0; i < gpgpu->thread_n; ++i) {
+    const uint32_t start_offset = i * gpgpu->cst_sz;
+    for (j = 0; j < gpgpu->binded_n; ++j) {
+      const uint32_t offset = start_offset + gpgpu->binded_offset[j];
+      const uint32_t fake_address = gpgpu->memory_remap[j];
+      *(uint32_t*) (gpgpu->curbe + offset) = fake_address; /* XXX 32 bits only */
+    }
+  }
+}
+
+static void
+sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry)
 {
   assert(gpgpu);
   memset(gpgpu, 0, sizeof(*gpgpu));
@@ -245,16 +268,35 @@ static void sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t
   gpgpu->max_threads = max_threads;
 }
 
-static void sim_gpgpu_states_setup(sim_gpgpu gpgpu, cl_gpgpu_kernel *kernel)
+static void
+sim_gpgpu_states_setup(sim_gpgpu gpgpu, cl_gpgpu_kernel *kernel)
 {
+  uint32_t i;
+  size_t sz = 0;
   cl_buffer_map(kernel->bo, 0);
   gpgpu->kernel = *(sim_kernel_cb **) cl_buffer_get_virtual(kernel->bo);
+  gpgpu->thread_n = kernel->thread_n;
+
+  /* Because of flat address space and because the host machine can be 64 bits
+   * and Gen 32 bits, we just create a fake memory space of 1GB and copy back
+   * and forth the data from here
+   */
+  for (i = 0; i < gpgpu->binded_n; ++i) {
+    gpgpu->memory_remap[i] = sz;
+    sz += gpgpu->binded_buf[i]->sz;
+  }
+
+  /* Copy everything to the fake address space */
+  if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory);
+  gpgpu->fake_memory = cl_malloc(sz);
+  for (i = 0; i < gpgpu->binded_n; ++i) {
+    const sim_buffer buf = gpgpu->binded_buf[i];
+    memcpy(gpgpu->fake_memory + gpgpu->memory_remap[i], buf->data, buf->sz);
+  }
 }
 
-static void sim_gpgpu_bind_buf(sim_gpgpu gpgpu,
-                               sim_buffer buf,
-                               uint32_t offset,
-                               uint32_t cchint)
+static void
+sim_gpgpu_bind_buf(sim_gpgpu gpgpu, sim_buffer buf, uint32_t offset, uint32_t cchint)
 {
   assert(gpgpu->binded_n < max_buf_n);
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
@@ -262,12 +304,13 @@ static void sim_gpgpu_bind_buf(sim_gpgpu gpgpu,
   gpgpu->binded_n++;
 }
 
-static void sim_gpgpu_walker(sim_gpgpu gpgpu,
-                             uint32_t simd_sz,
-                             uint32_t thread_n,
-                             const size_t global_wk_off[3],
-                             const size_t global_wk_sz[3],
-                             const size_t local_wk_sz[3])
+static void
+sim_gpgpu_walker(sim_gpgpu gpgpu,
+                 uint32_t simd_sz,
+                 uint32_t thread_n,
+                 const size_t global_wk_off[3],
+                 const size_t global_wk_sz[3],
+                 const size_t local_wk_sz[3])
 {
   uint32_t x, y, z, t, i;
   const uint32_t global_wk_dim[3] = {
@@ -277,34 +320,20 @@ static void sim_gpgpu_walker(sim_gpgpu gpgpu,
   };
   assert(simd_sz == 8 || simd_sz == 16);
 
-  /* Because of flat address space and because the host machine can be 64 bits
-   * and gen 32 bits, we just create a fake memory space of 1GB and copy back
-   * and forth the data from here
-   */
-  size_t sz = 0;
-  uint32_t memory_remap[max_buf_n];
-  for (i = 0; i < gpgpu->binded_n; ++i) {
-    memory_remap[i] = sz;
-    sz += gpgpu->binded_buf[i]->sz;
-  }
-
-  /* Copy everything to the fake address space */
-  gpgpu->fake_memory = cl_malloc(sz);
-  for (i = 0; i < gpgpu->binded_n; ++i) {
-    const sim_buffer buf = gpgpu->binded_buf[i];
-    memcpy(gpgpu->fake_memory + memory_remap[i], buf->data, buf->sz);
-  }
-
+  gbe_simulator sim = sim_simulator_new();
+  sim->set_base_address(sim, gpgpu->fake_memory);
+  sim->set_curbe_address(sim, gpgpu->curbe);
   for (z = 0; z < global_wk_dim[2]; ++z)
   for (y = 0; y < global_wk_dim[1]; ++y)
   for (x = 0; x < global_wk_dim[0]; ++x)
   for (t = 0; t < thread_n; ++t)
-    gpgpu->kernel();
+    gpgpu->kernel(sim, t, x, y, z);
+  sim_simulator_delete(sim);
 
   /* Get the results back*/
   for (i = 0; i < gpgpu->binded_n; ++i) {
     const sim_buffer buf = gpgpu->binded_buf[i];
-    memcpy(buf->data, gpgpu->fake_memory + memory_remap[i], buf->sz);
+    memcpy(buf->data, gpgpu->fake_memory + gpgpu->memory_remap[i], buf->sz);
   }
   cl_free(gpgpu->fake_memory);
   gpgpu->fake_memory = NULL;
diff --git a/src/sim/sim_simulator.c b/src/sim/sim_simulator.c
new file mode 100644
index 0000000..667f02d
--- /dev/null
+++ b/src/sim/sim_simulator.c
@@ -0,0 +1,69 @@
+/* 
+ * Copyright Â© 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "sim/sim_simulator.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include <stdlib.h>
+
+/* Implement of the simulator interface */
+struct _sim_simulator {
+  struct _gbe_simulator internal; /* Contains the call backs */
+  void *base_address;             /* Base address of the fake address space */
+  void *curbe_address;            /* Curbe address */
+};
+typedef struct _sim_simulator *sim_simulator;
+
+static void *sim_get_base_address(sim_simulator sim) {
+  return sim->base_address;
+}
+static void sim_set_base_address(sim_simulator sim, void *addr) {
+  sim->base_address = addr;
+}
+static void *sim_get_curbe_address(sim_simulator sim) {
+  return sim->curbe_address;
+}
+static void sim_set_curbe_address(sim_simulator sim, void *addr) {
+  sim->curbe_address = addr;
+}
+
+LOCAL void
+sim_simulator_delete(gbe_simulator sim) {
+  if (UNLIKELY(sim == NULL)) return;
+  cl_free(sim);
+}
+
+LOCAL gbe_simulator
+sim_simulator_new(void)
+{
+  sim_simulator sim;
+  TRY_ALLOC_NO_ERR(sim, cl_calloc(1, sizeof(struct _sim_simulator)));
+  sim->internal.get_base_address = (sim_get_base_address_cb*) sim_get_base_address;
+  sim->internal.set_base_address = (sim_set_base_address_cb*) sim_set_base_address;
+  sim->internal.get_curbe_address = (sim_get_curbe_address_cb*) sim_get_curbe_address;
+  sim->internal.set_curbe_address = (sim_set_curbe_address_cb*) sim_set_curbe_address;
+
+exit:
+  return (gbe_simulator) sim;
+error:
+  sim_simulator_delete((gbe_simulator) sim);
+  sim = NULL;
+  goto exit;
+}
+
diff --git a/src/sim/sim_simulator.h b/src/sim/sim_simulator.h
new file mode 100644
index 0000000..55e84fa
--- /dev/null
+++ b/src/sim/sim_simulator.h
@@ -0,0 +1,31 @@
+/* 
+ * Copyright Â© 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#ifndef __SIM_SIMULATOR_H__
+#define __SIM_SIMULATOR_H__
+
+#include "gen/simulator.h"
+
+/* Allocate and initialize a new Gen simulator that run the c++ backend code */
+extern gbe_simulator sim_simulator_new(void);
+/* Destroy a Gen simulator */
+extern void sim_simulator_delete(gbe_simulator);
+
+#endif /* __SIM_GEN_SIMULATOR_H__ */
+
-- 
2.7.4