From f98a8cfb7eec88369ca7059b4a338d33d9bdd003 Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Sun, 8 Apr 2012 22:51:40 -0700 Subject: [PATCH] Made most of the very basic pieces for the simulation parts --- kernels/compile.sh | 6 ++ kernels/stdlib.h | 114 +++++++++++++++++++++++++++++++++++++ kernels/test_write_only_2.cl | 8 +++ kernels/test_write_only_2.cl.ll | 28 +++++++++ src/CMakeLists.txt | 5 ++ src/cl_api.c | 10 ++-- src/cl_command_queue.c | 28 ++++----- src/cl_command_queue_gen7.c | 2 +- src/cl_kernel.c | 62 +++++++++++++++++++- src/cl_kernel.h | 32 +++++++---- src/sim/sim_driver.c | 123 +++++++++++++++++++++++++--------------- src/sim/sim_simulator.c | 69 ++++++++++++++++++++++ src/sim/sim_simulator.h | 31 ++++++++++ 13 files changed, 434 insertions(+), 84 deletions(-) create mode 100755 kernels/compile.sh create mode 100644 kernels/stdlib.h create mode 100644 kernels/test_write_only_2.cl create mode 100644 kernels/test_write_only_2.cl.ll create mode 100644 src/sim/sim_simulator.c create mode 100644 src/sim/sim_simulator.h diff --git a/kernels/compile.sh b/kernels/compile.sh new file mode 100755 index 0000000..e1177a7 --- /dev/null +++ b/kernels/compile.sh @@ -0,0 +1,6 @@ +#!/bin/bash +clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1 -o $1.o +llvm-dis $1.o +rm $1.o +mv $1.o.ll $1.ll + diff --git a/kernels/stdlib.h b/kernels/stdlib.h new file mode 100644 index 0000000..472655a --- /dev/null +++ b/kernels/stdlib.h @@ -0,0 +1,114 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * Author: Benjamin Segovia + */ + +#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \ +__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \ +__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \ +__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void); +DECL_INTERNAL_WORK_ITEM_FN(get_group_id) +DECL_INTERNAL_WORK_ITEM_FN(get_local_id) +DECL_INTERNAL_WORK_ITEM_FN(get_local_size) +DECL_INTERNAL_WORK_ITEM_FN(get_global_size) +DECL_INTERNAL_WORK_ITEM_FN(get_num_groups) +#undef DECL_INTERNAL_WORK_ITEM_FN + +#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \ +inline unsigned NAME(unsigned int dim) { \ + if (dim == 0) return __gen_ocl_##NAME##0(); \ + else if (dim == 1) return __gen_ocl_##NAME##1(); \ + else if (dim == 2) return __gen_ocl_##NAME##2(); \ + else return 0; \ +} +DECL_PUBLIC_WORK_ITEM_FN(get_group_id) +DECL_PUBLIC_WORK_ITEM_FN(get_local_id) +DECL_PUBLIC_WORK_ITEM_FN(get_local_size) +DECL_PUBLIC_WORK_ITEM_FN(get_global_size) +DECL_PUBLIC_WORK_ITEM_FN(get_num_groups) +#undef DECL_PUBLIC_WORK_ITEM_FN + +inline unsigned int get_global_id(unsigned int dim) { + return get_local_id(dim) + get_local_size(dim) * get_num_groups(dim); +} + +__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c); +__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) { + return cond ? src0 : src1; +} +__attribute__((overloadable)) inline int select(int src0, int src1, int cond) { + return cond ? src0 : src1; +} + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned uint3 __attribute__((ext_vector_type(3))); +typedef unsigned uint4 __attribute__((ext_vector_type(4))); +typedef bool bool2 __attribute__((ext_vector_type(2))); +typedef bool bool3 __attribute__((ext_vector_type(3))); +typedef bool bool4 __attribute__((ext_vector_type(4))); + +// This will be optimized out by LLVM and will output LLVM select instructions +#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ +__attribute__((overloadable)) \ +inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ + TYPE4 dst; \ + const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ + const TYPE x1 = src1.x; \ + const TYPE y0 = src0.y; \ + const TYPE y1 = src1.y; \ + const TYPE z0 = src0.z; \ + const TYPE z1 = src1.z; \ + const TYPE w0 = src0.w; \ + const TYPE w1 = src1.w; \ + \ + dst.x = (cond.x & MASK) ? x1 : x0; \ + dst.y = (cond.y & MASK) ? y1 : y0; \ + dst.z = (cond.z & MASK) ? z1 : z0; \ + dst.w = (cond.w & MASK) ? w1 : w0; \ + return dst; \ +} +DECL_SELECT4(int4, int, int4, 0x80000000) +DECL_SELECT4(float4, float, int4, 0x80000000) +#undef DECL_SELECT4 + +__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) { + return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y)); +} +__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) { + return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z)); +} +__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) { + return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), + mad(a.z,b.z,c.z), mad(a.w,b.w,c.w)); +} + +#define __private __attribute__((address_space(0))) +#define __global __attribute__((address_space(1))) +#define __constant __attribute__((address_space(2))) +//#define __local __attribute__((address_space(3))) +#define global __global +//#define local __local +#define constant __constant +#define private __private + +#define NULL ((void*)0) diff --git a/kernels/test_write_only_2.cl b/kernels/test_write_only_2.cl new file mode 100644 index 0000000..8edd39a --- /dev/null +++ b/kernels/test_write_only_2.cl @@ -0,0 +1,8 @@ +#include "stdlib.h" +__kernel void +test_write_only(__global float* dst ) +{ + int id = (int)get_global_id(0); + dst[id] = 1; +} + diff --git a/kernels/test_write_only_2.cl.ll b/kernels/test_write_only_2.cl.ll new file mode 100644 index 0000000..3d7e493 --- /dev/null +++ b/kernels/test_write_only_2.cl.ll @@ -0,0 +1,28 @@ +; ModuleID = 'test_write_only_2.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_kernel void @test_write_only(float addrspace(1)* nocapture %dst) nounwind noinline { +get_global_id.exit: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds float addrspace(1)* %dst, i32 %add.i + store float 1.000000e+00, float addrspace(1)* %arrayidx, align 4, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (float addrspace(1)*)* @test_write_only} +!1 = metadata !{metadata !"float", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9e44bc7..035e941 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,10 +17,15 @@ SET(OPENCL_SRC cl_device_id.c cl_context.c cl_command_queue.c + cl_command_queue.h cl_command_queue_gen7.c cl_driver.c + cl_driver.h cl_driver.cpp sim/sim_driver.c + sim/sim_driver.h + sim/sim_simulator.c + sim/sim_simulator.h intel/intel_gpgpu.c intel/intel_batchbuffer.c intel/intel_driver.c diff --git a/src/cl_api.c b/src/cl_api.c index 2271be9..aa7b664 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -621,11 +621,11 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, void * param_value, size_t * param_value_size_ret) { - return cl_get_kernel_workgroup_info(device, - param_name, - param_value_size, - param_value, - param_value_size_ret); + return cl_get_kernel_workgroup_info(device, + param_name, + param_value_size, + param_value, + param_value_size_ret); } cl_int diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 79f67a9..06eada9 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -105,26 +105,18 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_buffer *scratch, uint32_t local_sz) { -#if 0 - cl_context ctx = queue->ctx; - cl_gpgpu gpgpu = queue->gpgpu; - cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx); - cl_buffer sync_bo = NULL; - cl_int err = CL_SUCCESS; - - /* Now bind a bo used for synchronization */ - sync_bo = cl_buffer_alloc(bufmgr, "sync surface", 64, 64); - // cl_gpgpu_bind_buf(gpgpu, GEN_MAX_SURFACES-1, sync_bo, cc_llc_l3); - if (queue->last_batch != NULL) - cl_buffer_unreference(queue->last_batch); - queue->last_batch = sync_bo; + /* Bind all user buffers (given by clSetKernelArg) */ + uint32_t i; + for (i = 0; i < k->arg_n; ++k) { + uint32_t offset; // location of the address in the curbe + if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR && + gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_CONSTANT_PTR) + continue; + offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i); + cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3); + } -// error: - assert(err == CL_SUCCESS); /* Cannot fail here */ - return err; -#else return CL_SUCCESS; -#endif } #if USE_FULSIM diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 70a1827..d35bf7d 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -87,7 +87,7 @@ cl_curbe_fill(cl_kernel ker, const size_t *global_wk_sz, const size_t *local_wk_sz) { - uint32_t offset; + int32_t offset; #define UPLOAD(ENUM, VALUE) \ if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \ *((uint32_t *) (curbe + offset)) = VALUE; diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 5b9f8bc..36064a4 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -35,6 +35,7 @@ LOCAL void cl_kernel_delete(cl_kernel k) { + uint32_t i; if (k == NULL) return; /* We are not done with the kernel */ @@ -46,6 +47,13 @@ cl_kernel_delete(cl_kernel k) if (k->ref_its_program) cl_program_delete(k->program); /* Release the curbe if allocated */ if (k->curbe) cl_free(k->curbe); + /* Release the argument array if required */ + if (k->args) { + for (i = 0; i < k->arg_n; ++i) + if (k->args[i].mem != NULL) + cl_mem_delete(k->args[i].mem); + cl_free(k->args); + } k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(k); } @@ -83,9 +91,56 @@ cl_kernel_add_ref(cl_kernel k) LOCAL cl_int cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value) { - cl_int err = CL_SUCCESS; + uint32_t offset; /* where to patch */ + enum gbe_arg_type arg_type; /* kind of argument */ + size_t arg_sz; /* size of the argument */ + cl_mem mem; /* for __global, __constant and image arguments */ + + if (UNLIKELY(index >= k->arg_n)) + return CL_INVALID_ARG_INDEX; + arg_type = gbe_kernel_get_arg_type(k->opaque, index); + arg_sz = gbe_kernel_get_arg_size(k->opaque, index); + if (UNLIKELY(arg_sz != sz)) + return CL_INVALID_ARG_SIZE; + + /* Copy the structure or the value directly into the curbe */ + if (arg_type == GBE_ARG_VALUE) { + if (UNLIKELY(value == NULL)) + return CL_INVALID_KERNEL_ARGS; + offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index); + assert(offset + sz <= k->curbe_sz); + memcpy(k->curbe + offset, value, sz); + k->args[index].local_sz = 0; + k->args[index].is_set = 1; + k->args[index].mem = NULL; + return CL_SUCCESS; + } - return err; + /* For a local pointer just save the size */ + if (arg_type == GBE_ARG_LOCAL_PTR) { + if (UNLIKELY(value != NULL)) + return CL_INVALID_KERNEL_ARGS; + k->args[index].local_sz = sz; + k->args[index].is_set = 1; + k->args[index].mem = NULL; + return CL_SUCCESS; + } + + /* Otherwise, we just need to check that this is a buffer */ + if (UNLIKELY(value == NULL)) + return CL_INVALID_KERNEL_ARGS; + mem = *(cl_mem*) value; + if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER)) + return CL_INVALID_ARG_VALUE; + if (mem->is_image) + if (UNLIKELY(arg_type == GBE_ARG_IMAGE)) + return CL_INVALID_ARG_VALUE; + cl_mem_add_ref(mem); + k->args[index].mem = mem; + k->args[index].is_set = 1; + k->args[index].local_sz = 0; + + return CL_SUCCESS; } LOCAL uint32_t @@ -105,6 +160,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque) const uint32_t code_sz = gbe_kernel_get_code_size(opaque); const char *code = gbe_kernel_get_code(opaque); k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u); + k->arg_n = gbe_kernel_get_arg_num(opaque); /* Upload the code */ cl_buffer_subdata(k->bo, 0, code_sz, code); @@ -137,6 +193,8 @@ cl_kernel_dup(cl_kernel from) to->ref_n = 1; to->magic = CL_MAGIC_KERNEL_HEADER; to->program = from->program; + to->arg_n = from->arg_n; + TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument))); /* Retain the bos */ if (from->bo) cl_buffer_reference(from->bo); diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 545c268..1906cec 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -32,18 +32,28 @@ /* This is the kernel as it is interfaced by the compiler */ struct _gbe_kernel; -/*! One OCL function */ +/* We need to save buffer data for relocation and binding and we must figure out + * if all arguments are properly set + */ +typedef struct cl_argument { + cl_mem mem; /* For image and regular buffers */ + uint32_t local_sz:31; /* For __local size specification */ + uint32_t is_set:1; /* All args must be set before NDRange */ +} cl_argument; + +/* One OCL function */ struct _cl_kernel { - uint64_t magic; /* To identify it as a kernel */ - volatile int ref_n; /* We reference count this object */ - cl_buffer bo; /* The code itself */ - cl_buffer const_bo; /* Buffer for all __constants values in the OCL program */ - cl_program program; /* Owns this structure (and pointers) */ - gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */ - char *curbe; /* One curbe per kernel */ - size_t curbe_sz; /* Size of it */ - - uint8_t ref_its_program; /* True only for the user kernel (created by clCreateKernel) */ + uint64_t magic; /* To identify it as a kernel */ + volatile int ref_n; /* We reference count this object */ + cl_buffer bo; /* The code itself */ + cl_buffer const_bo; /* Buffer for all __constants values in the OCL program */ + cl_program program; /* Owns this structure (and pointers) */ + gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */ + char *curbe; /* One curbe per kernel */ + size_t curbe_sz; /* Size of it */ + cl_argument *args; /* To track argument setting */ + uint32_t arg_n:31; /* Number of arguments */ + uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */ }; /* Allocate an empty kernel */ diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c index aa6c7ac..3fa224c 100644 --- a/src/sim/sim_driver.c +++ b/src/sim/sim_driver.c @@ -22,12 +22,16 @@ #include "cl_device_data.h" #include "sim/sim_driver.h" #include "CL/cl.h" +#include "cl_driver.h" +#include "gen/program.h" +#include "gen/simulator.h" +#include "sim/sim_simulator.h" + #include #include #include #include -#include "cl_driver.h" /* Fake buffer manager that just counts allocations */ struct _sim_bufmgr { volatile int buf_n; }; @@ -39,11 +43,7 @@ sim_bufmgr_new(void) return cl_calloc(1,sizeof(struct _sim_bufmgr)); } -static void -sim_bufmgr_delete(sim_bufmgr bufmgr) -{ - cl_free(bufmgr); -} +static void sim_bufmgr_delete(sim_bufmgr bufmgr) { cl_free(bufmgr); } /* Fake low-level driver */ struct _sim_driver { @@ -91,7 +91,8 @@ sim_driver_get_bufmgr(sim_driver driver) static int sim_driver_get_device_id(void) { - return PCI_CHIP_IVYBRIDGE_GT2; // XXX get some env variable instead + /* XXX get some env variable instead */ + return PCI_CHIP_IVYBRIDGE_GT2; } /* Just a named buffer to mirror real drm functions */ @@ -184,7 +185,7 @@ static int sim_buffer_unpin(sim_buffer buf) {return 0;} static int sim_buffer_wait_rendering(sim_buffer buf) {return 0;} /* Function to call for each HW thread we simulate */ -typedef void (sim_kernel_cb)(void); +typedef void (sim_kernel_cb)(gbe_simulator, uint32_t, uint32_t, uint32_t, uint32_t); /* We can bind only a limited number of buffers */ enum { max_buf_n = 128 }; @@ -192,19 +193,23 @@ enum { max_buf_n = 128 }; /* Encapsulates operations needed to run one NDrange */ struct _sim_gpgpu { - sim_driver driver; /* the driver the gpgpu states belongs to */ - sim_kernel_cb *kernel; /* call it for each HW thread */ + sim_driver driver; /* the driver the gpgpu states belongs to */ + sim_kernel_cb *kernel; /* call it for each HW thread */ sim_buffer binded_buf[max_buf_n]; /* all buffers binded for the call */ char *fake_memory; /* fake memory to emulate flat address space in any mode (32 / 64 bits) */ + char *curbe; /* constant buffer */ uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */ + uint32_t memory_remap[max_buf_n]; /* offset of each buffer in the fake memory space */ uint32_t max_threads; /* HW threads running */ uint32_t cst_sz; /* size of the constant buffer */ uint32_t binded_n; /* number of buffers binded */ + uint32_t thread_n; /* number of threads to run per work group */ }; typedef struct _sim_gpgpu *sim_gpgpu; -static void sim_gpgpu_delete(sim_gpgpu gpgpu) -{ +static void sim_gpgpu_delete(sim_gpgpu gpgpu) { + if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory); + if (gpgpu->curbe) cl_free(gpgpu->curbe); cl_free(gpgpu); } @@ -212,7 +217,6 @@ static sim_gpgpu sim_gpgpu_new(sim_driver driver) { sim_gpgpu gpgpu = NULL; TRY_ALLOC_NO_ERR(gpgpu, cl_calloc(1, sizeof(struct _sim_gpgpu))); - exit: return gpgpu; error: @@ -223,21 +227,40 @@ error: static void sim_gpgpu_bind_image2D(sim_gpgpu gpgpu, int32_t index, - sim_buffer obj_bo, + sim_buffer bo, uint32_t format, int32_t w, int32_t h, int pitch, cl_gpgpu_tiling tiling) {} static void sim_gpgpu_set_perf_counters(sim_gpgpu gpgpu, sim_buffer perf) {} -static void sim_gpgpu_upload_constants(sim_gpgpu gpgpu, const void* data, uint32_t size) {} static void sim_gpgpu_upload_samplers(sim_gpgpu gpgpu, const void *data, uint32_t n) {} static void sim_gpgpu_batch_reset(sim_gpgpu gpgpu, size_t sz) {} static void sim_gpgpu_batch_start(sim_gpgpu gpgpu) {} static void sim_gpgpu_batch_end(sim_gpgpu gpgpu, uint32_t flush_mode) {} static void sim_gpgpu_flush(sim_gpgpu gpgpu) {} -static void sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry) +static void +sim_gpgpu_upload_constants(sim_gpgpu gpgpu, const void* data, uint32_t size) +{ + uint32_t i, j; + assert(size == gpgpu->cst_sz * gpgpu->thread_n); + if (gpgpu->curbe) cl_free(gpgpu->curbe); + gpgpu->curbe = (char*) cl_malloc(size); + + /* Upload the buffer offsets per thread */ + for (i = 0; i < gpgpu->thread_n; ++i) { + const uint32_t start_offset = i * gpgpu->cst_sz; + for (j = 0; j < gpgpu->binded_n; ++j) { + const uint32_t offset = start_offset + gpgpu->binded_offset[j]; + const uint32_t fake_address = gpgpu->memory_remap[j]; + *(uint32_t*) (gpgpu->curbe + offset) = fake_address; /* XXX 32 bits only */ + } + } +} + +static void +sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry) { assert(gpgpu); memset(gpgpu, 0, sizeof(*gpgpu)); @@ -245,16 +268,35 @@ static void sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t gpgpu->max_threads = max_threads; } -static void sim_gpgpu_states_setup(sim_gpgpu gpgpu, cl_gpgpu_kernel *kernel) +static void +sim_gpgpu_states_setup(sim_gpgpu gpgpu, cl_gpgpu_kernel *kernel) { + uint32_t i; + size_t sz = 0; cl_buffer_map(kernel->bo, 0); gpgpu->kernel = *(sim_kernel_cb **) cl_buffer_get_virtual(kernel->bo); + gpgpu->thread_n = kernel->thread_n; + + /* Because of flat address space and because the host machine can be 64 bits + * and Gen 32 bits, we just create a fake memory space of 1GB and copy back + * and forth the data from here + */ + for (i = 0; i < gpgpu->binded_n; ++i) { + gpgpu->memory_remap[i] = sz; + sz += gpgpu->binded_buf[i]->sz; + } + + /* Copy everything to the fake address space */ + if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory); + gpgpu->fake_memory = cl_malloc(sz); + for (i = 0; i < gpgpu->binded_n; ++i) { + const sim_buffer buf = gpgpu->binded_buf[i]; + memcpy(gpgpu->fake_memory + gpgpu->memory_remap[i], buf->data, buf->sz); + } } -static void sim_gpgpu_bind_buf(sim_gpgpu gpgpu, - sim_buffer buf, - uint32_t offset, - uint32_t cchint) +static void +sim_gpgpu_bind_buf(sim_gpgpu gpgpu, sim_buffer buf, uint32_t offset, uint32_t cchint) { assert(gpgpu->binded_n < max_buf_n); gpgpu->binded_buf[gpgpu->binded_n] = buf; @@ -262,12 +304,13 @@ static void sim_gpgpu_bind_buf(sim_gpgpu gpgpu, gpgpu->binded_n++; } -static void sim_gpgpu_walker(sim_gpgpu gpgpu, - uint32_t simd_sz, - uint32_t thread_n, - const size_t global_wk_off[3], - const size_t global_wk_sz[3], - const size_t local_wk_sz[3]) +static void +sim_gpgpu_walker(sim_gpgpu gpgpu, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], + const size_t global_wk_sz[3], + const size_t local_wk_sz[3]) { uint32_t x, y, z, t, i; const uint32_t global_wk_dim[3] = { @@ -277,34 +320,20 @@ static void sim_gpgpu_walker(sim_gpgpu gpgpu, }; assert(simd_sz == 8 || simd_sz == 16); - /* Because of flat address space and because the host machine can be 64 bits - * and gen 32 bits, we just create a fake memory space of 1GB and copy back - * and forth the data from here - */ - size_t sz = 0; - uint32_t memory_remap[max_buf_n]; - for (i = 0; i < gpgpu->binded_n; ++i) { - memory_remap[i] = sz; - sz += gpgpu->binded_buf[i]->sz; - } - - /* Copy everything to the fake address space */ - gpgpu->fake_memory = cl_malloc(sz); - for (i = 0; i < gpgpu->binded_n; ++i) { - const sim_buffer buf = gpgpu->binded_buf[i]; - memcpy(gpgpu->fake_memory + memory_remap[i], buf->data, buf->sz); - } - + gbe_simulator sim = sim_simulator_new(); + sim->set_base_address(sim, gpgpu->fake_memory); + sim->set_curbe_address(sim, gpgpu->curbe); for (z = 0; z < global_wk_dim[2]; ++z) for (y = 0; y < global_wk_dim[1]; ++y) for (x = 0; x < global_wk_dim[0]; ++x) for (t = 0; t < thread_n; ++t) - gpgpu->kernel(); + gpgpu->kernel(sim, t, x, y, z); + sim_simulator_delete(sim); /* Get the results back*/ for (i = 0; i < gpgpu->binded_n; ++i) { const sim_buffer buf = gpgpu->binded_buf[i]; - memcpy(buf->data, gpgpu->fake_memory + memory_remap[i], buf->sz); + memcpy(buf->data, gpgpu->fake_memory + gpgpu->memory_remap[i], buf->sz); } cl_free(gpgpu->fake_memory); gpgpu->fake_memory = NULL; diff --git a/src/sim/sim_simulator.c b/src/sim/sim_simulator.c new file mode 100644 index 0000000..667f02d --- /dev/null +++ b/src/sim/sim_simulator.c @@ -0,0 +1,69 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * Author: Benjamin Segovia + */ + +#include "sim/sim_simulator.h" +#include "cl_utils.h" +#include "cl_alloc.h" +#include + +/* Implement of the simulator interface */ +struct _sim_simulator { + struct _gbe_simulator internal; /* Contains the call backs */ + void *base_address; /* Base address of the fake address space */ + void *curbe_address; /* Curbe address */ +}; +typedef struct _sim_simulator *sim_simulator; + +static void *sim_get_base_address(sim_simulator sim) { + return sim->base_address; +} +static void sim_set_base_address(sim_simulator sim, void *addr) { + sim->base_address = addr; +} +static void *sim_get_curbe_address(sim_simulator sim) { + return sim->curbe_address; +} +static void sim_set_curbe_address(sim_simulator sim, void *addr) { + sim->curbe_address = addr; +} + +LOCAL void +sim_simulator_delete(gbe_simulator sim) { + if (UNLIKELY(sim == NULL)) return; + cl_free(sim); +} + +LOCAL gbe_simulator +sim_simulator_new(void) +{ + sim_simulator sim; + TRY_ALLOC_NO_ERR(sim, cl_calloc(1, sizeof(struct _sim_simulator))); + sim->internal.get_base_address = (sim_get_base_address_cb*) sim_get_base_address; + sim->internal.set_base_address = (sim_set_base_address_cb*) sim_set_base_address; + sim->internal.get_curbe_address = (sim_get_curbe_address_cb*) sim_get_curbe_address; + sim->internal.set_curbe_address = (sim_set_curbe_address_cb*) sim_set_curbe_address; + +exit: + return (gbe_simulator) sim; +error: + sim_simulator_delete((gbe_simulator) sim); + sim = NULL; + goto exit; +} + diff --git a/src/sim/sim_simulator.h b/src/sim/sim_simulator.h new file mode 100644 index 0000000..55e84fa --- /dev/null +++ b/src/sim/sim_simulator.h @@ -0,0 +1,31 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * Author: Benjamin Segovia + */ + +#ifndef __SIM_SIMULATOR_H__ +#define __SIM_SIMULATOR_H__ + +#include "gen/simulator.h" + +/* Allocate and initialize a new Gen simulator that run the c++ backend code */ +extern gbe_simulator sim_simulator_new(void); +/* Destroy a Gen simulator */ +extern void sim_simulator_delete(gbe_simulator); + +#endif /* __SIM_GEN_SIMULATOR_H__ */ + -- 2.7.4