Currently, we simply allocate enough graphics memory to serve as the constant
memory space and bind it to bti 2. Constant cache reads are backed by dword
scatter reads. Unlike other data port messages, the addresses need to be
dword aligned and are expressed in units of dwords.
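For reference, a minimal sketch (not part of the patch) of the byte-to-dword
address conversion this implies, the same shift-right-by-2 that
emitDWordGather performs below; byte_to_dword_addr is only an illustrative
helper:

  #include <stdint.h>
  #include <assert.h>

  /* Illustrative helper only: the dword scattered read takes its offsets in
   * dword units, so the byte address computed for a constant load must be
   * 4-byte aligned and shifted right by 2 before the message is sent. */
  static inline uint32_t byte_to_dword_addr(uint32_t byte_addr)
  {
    assert((byte_addr & 3) == 0); /* address must be dword aligned */
    return byte_addr >> 2;        /* now in units of dwords */
  }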
Data in the constant address space is laid out in order: first the global
constant data, then the constant buffer kernel arguments.
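As a rough sketch of that layout (mirroring cl_upload_constant_buffer below;
constant_space_offset is a hypothetical helper used only for illustration):

  #include <stddef.h>

  #define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

  /* Hypothetical helper: global constant data comes first (dword aligned),
   * then each __constant pointer argument, each padded to a dword boundary.
   * If there is no global constant data, 4 bytes are reserved so that no
   * argument starts at address 0. */
  static size_t constant_space_offset(size_t global_const_size,
                                      const size_t *arg_sizes, int arg_index)
  {
    size_t offset = ALIGN(global_const_size, 4);
    if (global_const_size == 0)
      offset += 4;
    for (int i = 0; i < arg_index; ++i)
      offset += ALIGN(arg_sizes[i], 4);
    return offset;
  }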
v2: rename functions & variables to make the distinction between 'curbe' and 'constant buffer' clear
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
}
});
#undef INSERT_REG
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
- specialRegs.insert(ir::ocl::constoffst);
-
- // Insert serialized global constant arrays if used
- const ir::ConstantSet& constantSet = unit.getConstantSet();
- if (constantSet.getConstantNum()) {
- size_t size = constantSet.getDataSize();
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
- }
// Insert the number of threads
this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::constoffst)
+ reg == ir::ocl::workdim)
return true;
return false;
}
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
+ void emitDWordGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ GBE_ASSERT(valueNum == 1);
+ GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+ // Convert the byte address to a dword-based address (dword scatter reads take dword offsets)
+ GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+
+ sel.DWORD_GATHER(dst, addrDW, bti);
+ }
+
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- if (insn.getAddressSpace() == MEM_CONSTANT)
- this->emitIndirectMove(sel, insn, address);
+ if (insn.getAddressSpace() == MEM_CONSTANT) {
+ // XXX TODO: read 64-bit constants through the constant cache
+ // Per HW spec, constant cache messages read data at DWORD granularity at minimum,
+ // so byte/short data types have to be read through the data cache.
+ if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, 0x2);
+ else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitDWordGather(sel, insn, address, 0x2);
+ else {
+ const GenRegister value = sel.selReg(insn.getValue(0));
+ this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+ }
+ }
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
- allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
// Group and barrier IDs are always allocated by the hardware in r0
RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
- GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
- GBE_CURBE_GLOBAL_CONSTANT_DATA,
GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "const_curbe_offset",
"work_dimension",
};
DECL_NEW_REG(FAMILY_WORD, blockip);
DECL_NEW_REG(FAMILY_DWORD, barrierid);
DECL_NEW_REG(FAMILY_DWORD, threadn);
- DECL_NEW_REG(FAMILY_DWORD, constoffst);
DECL_NEW_REG(FAMILY_DWORD, workdim);
}
#undef DECL_NEW_REG
static const Register blockip = Register(19); // blockip
static const Register barrierid = Register(20);// barrierid
static const Register threadn = Register(21); // number of threads
- static const Register constoffst = Register(22); // offset of global constant array's curbe
- static const Register workdim = Register(23); // work dimention.
- static const uint32_t regNum = 24; // number of special registers
+ static const Register workdim = Register(22); // work dimension.
+ static const uint32_t regNum = 23; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
ir::Constant &con = unit.getConstantSet().getConstant(j ++);
con.setReg(reg.value());
- if(con.getOffset() != 0) {
- ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
- ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
- } else {
- ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
- }
+ ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
}
// Visit all the instructions and emit the IR registers or the value to
const ir::Type type = getType(ctx, elemType);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
+ if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
// Build the tuple data in the vector
return CL_SUCCESS;
}
-LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
- char * dst)
-{
- int i;
- for(i = 0; i < k->arg_n; i++) {
- enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-
- if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
- uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
- cl_mem mem = k->args[i].mem;
- cl_buffer_map(mem->bo, 1);
- void * addr = cl_buffer_get_virtual(mem->bo);
- memcpy(dst + offset, addr, mem->size);
- cl_buffer_unmap(mem->bo);
- }
- }
- return CL_SUCCESS;
-}
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
/* Bind all the image surfaces in the GPGPU state */
extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
-
-/*update constant buffer to final curbe */
-extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
#endif /* __CL_COMMAND_QUEUE_H__ */
block_ips[curr] = 0;
}
- /* Copy them to the constant buffer */
+ /* Copy them to the curbe buffer */
curr = 0;
for (i = 0; i < thread_n; ++i, data += cst_sz) {
uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
return err;
}
+static void
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+ /* calculate constant buffer size */
+ int32_t arg;
+ size_t offset;
+ gbe_program prog = ker->program->opaque;
+ const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
+ size_t global_const_size = gbe_program_get_global_constant_size(prog);
+ uint32_t constant_buf_size = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ cl_mem mem = ker->args[arg].mem;
+ constant_buf_size += ALIGN(mem->size, 4);
+ }
+ }
+ if(global_const_size == 0 && constant_buf_size == 0)
+ return;
+
+ cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+ cl_buffer_map(bo, 1);
+ char * cst_addr = cl_buffer_get_virtual(bo);
+ offset = 0;
+ if (global_const_size > 0) {
+ /* Write the global constant arrays */
+ gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+ }
+ offset += ALIGN(global_const_size, 4);
+
+ if(global_const_size == 0) {
+ /* reserve 4 bytes so that no constant ends up at address 0 */
+ offset += 4;
+ }
+
+ /* upload constant buffer argument */
+ int32_t curbe_offset = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ cl_mem mem = ker->args[arg].mem;
+
+ curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+ assert(curbe_offset >= 0);
+ *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+
+ cl_buffer_map(mem->bo, 1);
+ void * addr = cl_buffer_get_virtual(mem->bo);
+ memcpy(cst_addr + offset, addr, mem->size);
+ cl_buffer_unmap(mem->bo);
+ offset += ALIGN(mem->size, 4);
+ }
+ }
+ cl_buffer_unmap(bo);
+}
+
/* Will return the total amount of slm used */
static int32_t
cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
- UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
#undef UPLOAD
/* Write identity for the stack pointer. This is required by the stack pointer
int32_t i;
for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
}
-
- /* Write global constant arrays */
- if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
- /* Write the global constant arrays */
- gbe_program prog = ker->program->opaque;
- gbe_program_get_global_constant_data(prog, ker->curbe + offset);
- }
-
/* Handle the various offsets to SLM */
const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
int32_t arg, slm_offset = 0;
/* Compute the number of HW threads we need */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
- kernel.cst_sz = cst_sz;
+ kernel.curbe_sz = cst_sz;
- /* Curbe step 1: fill the constant buffer data shared by all threads */
+ /* Curbe step 1: fill the curbe (constant URB) buffer data shared by all threads */
if (ker->curbe) {
kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
cl_setup_scratch(gpgpu, ker);
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
+
+ cl_upload_constant_buffer(queue, ker);
+
cl_gpgpu_states_setup(gpgpu, &kernel);
/* Curbe step 2. Give the localID and upload it to video memory */
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
- cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
}
TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
- cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+ cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
}
/* Start a new batch buffer */
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
- uint32_t cst_sz; /* total size of all constants */
+ uint32_t curbe_sz; /* total size of all curbes */
cl_buffer bo; /* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_slm:1; /* For gen7 (automatic barrier management) */
typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
-/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
-extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+/* Fills current curbe buffer with data */
+typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
/* Setup all indirect states */
typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
-LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
.single_fp_config = 0, /* XXX */
.global_mem_cache_type = CL_READ_WRITE_CACHE,
.global_mem_size = 128 * 1024 * 1024,
-.max_constant_buffer_size = 64 << 10,
+.max_constant_buffer_size = 512 << 10,
.max_constant_args = 8,
.error_correction_support = CL_FALSE,
.host_unified_memory = CL_FALSE,
mem = *(cl_mem*) value;
- if(arg_type == GBE_ARG_CONSTANT_PTR) {
- int32_t cbOffset;
- cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
- //constant ptr's curbe offset changed, update it
- if(cbOffset >= 0) {
- offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- *((uint32_t *)(k->curbe + offset)) = cbOffset; //cb offset in curbe
- }
- }
-
cl_mem_add_ref(mem);
if (k->args[index].mem)
cl_mem_delete(k->args[index].mem);
/* We use the first 2 slots(0,1) for all the bufs.
* Notify the gbe this base index, thus gbe can avoid conflicts
* when it allocates slots for images*/
- gbe_set_image_base_index(2);
+ gbe_set_image_base_index(3);
exit:
return driver;
error:
intel_batchbuffer_t *batch;
cl_gpgpu_kernel *ker;
drm_intel_bo *binded_buf[max_buf_n]; /* all buffers binded for the call */
- uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
+ uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer */
uint32_t binded_n; /* number of buffers binded */
unsigned long img_bitmap; /* image usage bitmap. */
struct { drm_intel_bo *bo; } sampler_state_b;
struct { drm_intel_bo *bo; } perf_b;
struct { drm_intel_bo *bo; } scratch_b;
+ struct { drm_intel_bo *bo; } constant_b;
uint32_t per_thread_scratch;
struct {
if (gpgpu->scratch_b.bo)
drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+ if(gpgpu->constant_b.bo)
+ drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
intel_batchbuffer_delete(gpgpu->batch);
cl_free(gpgpu);
}
}
static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
intel_gpgpu_select_pipeline(gpgpu);
intel_gpgpu_set_base_address(gpgpu);
intel_gpgpu_load_vfe_state(gpgpu);
- intel_gpgpu_load_constant_buffer(gpgpu);
+ intel_gpgpu_load_curbe_buffer(gpgpu);
intel_gpgpu_load_idrt(gpgpu);
if (gpgpu->perf_b.bo) {
/* Binded buffers */
gpgpu->binded_n = 0;
gpgpu->img_bitmap = 0;
- gpgpu->img_index_base = 2;
+ gpgpu->img_index_base = 3;
gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
/* URB */
gpgpu->urb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
- /* Constant buffer */
+ /* Constant URB buffer */
if(gpgpu->curbe_b.bo)
dri_bo_unreference(gpgpu->curbe_b.bo);
uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
size_cb = ALIGN(size_cb, 4096);
- bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
assert(bo);
gpgpu->curbe_b.bo = bo;
obj_bo);
}
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+{
+ uint32_t s = size - 1;
+ assert(size != 0);
+
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+ memset(ss2, 0, sizeof(gen7_surface_state_t));
+ ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss2->ss5.cache_control = cc_llc_l3;
+ heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+
+ if(gpgpu->constant_b.bo)
+ dri_bo_unreference(gpgpu->constant_b.bo);
+ gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64);
+ assert(gpgpu->constant_b.bo);
+ ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+ dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0,
+ heap->binding_table[2] +
+ offsetof(gen7_surface_state_t, ss1),
+ gpgpu->constant_b.bo);
+ return gpgpu->constant_b.bo;
+}
+
+
/* Map address space with two 2GB surfaces. One surface for untyped message and
* one surface for byte scatters / gathers. Actually the HW does not require two
* surfaces but Fulsim complains
desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
- desc->desc4.curbe_read_len = kernel->cst_sz / 32;
+ desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
desc->desc4.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
}
static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
/* Now put all the relocations for our flat address space */
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
- *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
+ *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
- gpgpu->binded_offset[j]+i*k->cst_sz,
+ gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
0,
I915_GEM_DOMAIN_RENDER,
cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
- cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+ cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;