From b9686a0dd699fb4f33f4ed31c2d59a4c1b9560fe Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Fri, 31 May 2013 12:56:57 +0800 Subject: [PATCH] GBE: Add two builtin functions get_work_dim / get_global_offset. Signed-off-by: Zhigang Gong Tested-by: Yi Sun --- backend/src/backend/context.cpp | 2 ++ backend/src/backend/gen_reg_allocation.cpp | 1 + backend/src/backend/program.h | 1 + backend/src/ir/profile.cpp | 1 + backend/src/ir/profile.hpp | 3 ++- backend/src/llvm/llvm_gen_backend.cpp | 2 ++ backend/src/llvm/llvm_gen_ocl_function.hxx | 1 + backend/src/ocl_stdlib.h | 9 +++++++-- src/cl_api.c | 1 + src/cl_command_queue.c | 5 +++-- src/cl_command_queue.h | 1 + src/cl_command_queue_gen7.c | 5 ++++- 12 files changed, 26 insertions(+), 6 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index af1f579..48160de 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -430,6 +430,7 @@ namespace gbe INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1) INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1) INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1) + INSERT_REG(workdim, WORK_DIM, 1) INSERT_REG(numgroup0, GROUP_NUM_X, 1) INSERT_REG(numgroup1, GROUP_NUM_Y, 1) INSERT_REG(numgroup2, GROUP_NUM_Z, 1) @@ -621,6 +622,7 @@ namespace gbe reg == ir::ocl::goffset0 || reg == ir::ocl::goffset1 || reg == ir::ocl::goffset2 || + reg == ir::ocl::workdim || reg == ir::ocl::constoffst) return true; return false; diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 8c9f358..469be12 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -518,6 +518,7 @@ namespace gbe allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0); allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1); allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2); + allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim); allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0); allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1); allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2); diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index f178f8b..f36bfbf 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -69,6 +69,7 @@ enum gbe_curbe_type { GBE_CURBE_GROUP_NUM_X, GBE_CURBE_GROUP_NUM_Y, GBE_CURBE_GROUP_NUM_Z, + GBE_CURBE_WORK_DIM, GBE_CURBE_GLOBAL_CONSTANT_OFFSET, GBE_CURBE_GLOBAL_CONSTANT_DATA, GBE_CURBE_IMAGE_INFO, diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index c1dc650..99cd06c 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -76,6 +76,7 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, barrierid); DECL_NEW_REG(FAMILY_DWORD, threadn); DECL_NEW_REG(FAMILY_DWORD, constoffst); + DECL_NEW_REG(FAMILY_DWORD, workdim); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 32dd149..4b0ef5e 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -64,7 +64,8 @@ namespace ir { static const Register barrierid = Register(20);// barrierid static const Register threadn = Register(21); // number of threads static const Register constoffst = Register(22); // offset of global constant array's curbe - static const uint32_t regNum = 23; // number of special registers + static const Register workdim = Register(23); // work dimention. + static const uint32_t regNum = 24; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index a9c726b..10188d0 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -1653,6 +1653,8 @@ namespace gbe regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break; case GEN_OCL_GET_GLOBAL_OFFSET2: regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break; + case GEN_OCL_GET_WORK_DIM: + regTranslator.newScalarProxy(ir::ocl::workdim, dst); break; case GEN_OCL_COS: case GEN_OCL_SIN: case GEN_OCL_SQR: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 0524744..6cd7298 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2) DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0) DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1) DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2) +DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim) // Math function DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs) diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h index 446007a..c954929 100644 --- a/backend/src/ocl_stdlib.h +++ b/backend/src/ocl_stdlib.h @@ -346,8 +346,11 @@ DEC(16); ///////////////////////////////////////////////////////////////////////////// // Work Items functions (see 6.11.1 of OCL 1.1 spec) ///////////////////////////////////////////////////////////////////////////// -// TODO get_global_offset -// TODO get_work_dim + +PURE CONST uint __gen_ocl_get_work_dim(void); +INLINE uint get_work_dim(void) { + return __gen_ocl_get_work_dim(); +} #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \ PURE CONST unsigned int __gen_ocl_##NAME##0(void); \ @@ -357,6 +360,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id) DECL_INTERNAL_WORK_ITEM_FN(get_local_id) DECL_INTERNAL_WORK_ITEM_FN(get_local_size) DECL_INTERNAL_WORK_ITEM_FN(get_global_size) +DECL_INTERNAL_WORK_ITEM_FN(get_global_offset) DECL_INTERNAL_WORK_ITEM_FN(get_num_groups) #undef DECL_INTERNAL_WORK_ITEM_FN @@ -371,6 +375,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id) DECL_PUBLIC_WORK_ITEM_FN(get_local_id) DECL_PUBLIC_WORK_ITEM_FN(get_local_size) DECL_PUBLIC_WORK_ITEM_FN(get_global_size) +DECL_PUBLIC_WORK_ITEM_FN(get_global_offset) DECL_PUBLIC_WORK_ITEM_FN(get_num_groups) #undef DECL_PUBLIC_WORK_ITEM_FN diff --git a/src/cl_api.c b/src/cl_api.c index 9c5943b..5ef95b7 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -1453,6 +1453,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, /* Do device specific checks are enqueue the kernel */ err = cl_command_queue_ND_range(command_queue, kernel, + work_dim, fixed_global_off, fixed_global_sz, fixed_local_sz); diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 1a37c78..a3987d8 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -356,7 +356,7 @@ error: } #endif -extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *); +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *); static cl_int cl_kernel_check_args(cl_kernel k) @@ -371,6 +371,7 @@ cl_kernel_check_args(cl_kernel k) LOCAL cl_int cl_command_queue_ND_range(cl_command_queue queue, cl_kernel k, + const uint32_t work_dim, const size_t *global_wk_off, const size_t *global_wk_sz, const size_t *local_wk_sz) @@ -394,7 +395,7 @@ cl_command_queue_ND_range(cl_command_queue queue, #endif /* USE_FULSIM */ if (ver == 7 || ver == 75) - TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz); + TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz); else FATAL ("Unknown Gen Device"); diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index f0c00f4..5a792a2 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -54,6 +54,7 @@ extern void cl_command_queue_add_ref(cl_command_queue); /* Map ND range kernel from OCL API */ extern cl_int cl_command_queue_ND_range(cl_command_queue queue, cl_kernel ker, + const uint32_t work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size); diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index bc1a322..5950334 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -98,6 +98,7 @@ error: /* Will return the total amount of slm used */ static int32_t cl_curbe_fill(cl_kernel ker, + const uint32_t work_dim, const size_t *global_wk_off, const size_t *global_wk_sz, const size_t *local_wk_sz, @@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker, UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]); UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]); UPLOAD(GBE_CURBE_THREAD_NUM, thread_n); + UPLOAD(GBE_CURBE_WORK_DIM, work_dim); UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32); #undef UPLOAD @@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) LOCAL cl_int cl_command_queue_ND_range_gen7(cl_command_queue queue, cl_kernel ker, + const uint32_t work_dim, const size_t *global_wk_off, const size_t *global_wk_sz, const size_t *local_wk_sz) @@ -214,7 +217,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, /* Curbe step 1: fill the constant buffer data shared by all threads */ if (ker->curbe) - kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n); + kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n); /* Setup the kernel */ cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); -- 2.7.4