#undef DECL_PUBLIC_WORK_ITEM_FN
inline unsigned int get_global_id(unsigned int dim) {
- return get_local_id(dim) + get_local_size(dim) * get_num_groups(dim);
+ return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
}
__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
--- /dev/null
+#include "stdlib.h"
+__kernel void
+test_copy_buffer(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
+
--- /dev/null
+; ModuleID = 'test_copy_buffer.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @test_copy_buffer(float addrspace(1)* nocapture %src, float addrspace(1)* nocapture %dst) nounwind noinline {
+get_global_id.exit:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_group_id0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds float addrspace(1)* %src, i32 %add.i
+ %0 = load float addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %arrayidx1 = getelementptr inbounds float addrspace(1)* %dst, i32 %add.i
+ store float %0, float addrspace(1)* %arrayidx1, align 4, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_group_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*)* @test_copy_buffer}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
get_global_id.exit:
%call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
%call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
- %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_group_id0() nounwind readnone
%mul.i = mul i32 %call.i10.i, %call.i3.i
%add.i = add i32 %mul.i, %call.i.i
%arrayidx = getelementptr inbounds float addrspace(1)* %dst, i32 %add.i
ret void
}
-declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_group_id0() nounwind readnone
declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
TARGET_LINK_LIBRARIES(cl_test cl)
ADD_EXECUTABLE(test_write_only tests/test_write_only.c)
+ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c)
ADD_EXECUTABLE(test_eot tests/test_eot.c)
TARGET_LINK_LIBRARIES(test_eot cl_test m)
TARGET_LINK_LIBRARIES(test_write_only cl_test m)
+TARGET_LINK_LIBRARIES(test_copy_buffer cl_test m)
#ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c)
#ADD_EXECUTABLE(test_copy_image tests/test_copy_image.c)
cl_buffer private_bo = NULL, scratch_bo = NULL;
cl_gpgpu_kernel kernel;
const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
- size_t i, batch_sz = 0u, local_sz = 0u, cst_sz = 0u, local_id_sz = 0u;
+ size_t i, batch_sz = 0u, local_sz = 0u, local_id_sz = 0u, cst_sz = ker->curbe_sz;
size_t thread_n = 0u, id_offset = 0u;
cl_int err = CL_SUCCESS;
kernel.barrierID = 0;
kernel.use_barrier = 0;
kernel.slm_sz = 0;
- kernel.cst_sz = 0;
- /* Fill the constant buffer */
+ /* Curbe step 1: fill the constant buffer data shared by all threads */
curbe = alloca(ker->curbe_sz);
cl_curbe_fill(ker, curbe, global_wk_off, global_wk_sz, local_wk_sz);
cl_command_queue_bind_surface(queue, ker, curbe, NULL, &private_bo, &scratch_bo, 0);
cl_gpgpu_states_setup(gpgpu, &kernel);
- /* CURBE step 2. Give the localID and upload it to video memory */
+ /* Curbe step 2. Give the localID and upload it to video memory */
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
if (curbe)
for (i = 0; i < thread_n; ++i)
#include <stdlib.h>
#include <string.h>
}
+
namespace
{
- /* Just use c++ pre-main to initialize the call-backs */
- struct CallBackInitializer
+ /*! Just use c++ pre-main to initialize the call-backs */
+ struct OCLDriverCallBackInitializer
{
- CallBackInitializer(void) {
+ OCLDriverCallBackInitializer(void) {
const char *run_it = getenv("OCL_SIMULATOR");
if (run_it != NULL && !strcmp(run_it, "2"))
sim_setup_callbacks();
}
};
- /* Set the call backs at pre-main time */
- LOCAL CallBackInitializer cbInitializer;
-}
+ /*! Set the call backs at pre-main time */
+ static OCLDriverCallBackInitializer cbInitializer;
+} /* namespace */
/* Create the curbe */
k->curbe_sz = gbe_kernel_get_curbe_size(k->opaque);
- TRY_ALLOC_NO_ERR(k->curbe, cl_malloc(k->curbe_sz));
- return;
-
-error:
- if (k->curbe) cl_free(k->curbe);
- if (k->bo) cl_buffer_unreference(k->bo);
- k->curbe = NULL;
- k->bo = NULL;
- return;
}
LOCAL cl_kernel
to->magic = CL_MAGIC_KERNEL_HEADER;
to->program = from->program;
to->arg_n = from->arg_n;
+ to->curbe_sz = from->curbe_sz;
TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
+ if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
/* Retain the bos */
if (from->bo) cl_buffer_reference(from->bo);
uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
uint32_t memory_remap[max_buf_n]; /* offset of each buffer in the fake memory space */
uint32_t max_threads; /* HW threads running */
- uint32_t cst_sz; /* size of the constant buffer */
+ uint32_t curbe_sz; /* size of curbe used per HW thread */
uint32_t binded_n; /* number of buffers binded */
uint32_t thread_n; /* number of threads to run per work group */
};
sim_gpgpu_upload_constants(sim_gpgpu gpgpu, const void* data, uint32_t size)
{
uint32_t i, j;
- assert(size == gpgpu->cst_sz * gpgpu->thread_n);
+ assert(size == gpgpu->curbe_sz * gpgpu->thread_n);
if (gpgpu->curbe) cl_free(gpgpu->curbe);
gpgpu->curbe = (char*) cl_malloc(size);
+ memcpy(gpgpu->curbe, data, size);
/* Upload the buffer offsets per thread */
for (i = 0; i < gpgpu->thread_n; ++i) {
- const uint32_t start_offset = i * gpgpu->cst_sz;
+ const uint32_t start_offset = i * gpgpu->curbe_sz;
for (j = 0; j < gpgpu->binded_n; ++j) {
const uint32_t offset = start_offset + gpgpu->binded_offset[j];
const uint32_t fake_address = gpgpu->memory_remap[j];
{
assert(gpgpu);
memset(gpgpu, 0, sizeof(*gpgpu));
- gpgpu->cst_sz = size_cs_entry * 32;
+ gpgpu->curbe_sz = size_cs_entry * 32;
gpgpu->max_threads = max_threads;
}
gbe_simulator sim = sim_simulator_new();
sim->set_base_address(sim, gpgpu->fake_memory);
sim->set_curbe_address(sim, gpgpu->curbe);
+ sim->set_curbe_size(sim, gpgpu->curbe_sz);
for (z = 0; z < global_wk_dim[2]; ++z)
for (y = 0; y < global_wk_dim[1]; ++y)
for (x = 0; x < global_wk_dim[0]; ++x)