We use the floatn's assignment to do the copy.
The 128-byte pattern size corresponds to double16, but because
of the double-precision problems on our platform, we use two
float16 assignments to handle it.
Unaligned cases are not optimized yet; they just use char
assignment.
Signed-off-by: Junyan He <junyan.he@linux.intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
}
cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue,
+                    cl_mem buffer,
+                    const void * pattern,
+                    size_t pattern_size,
+                    size_t offset,
+                    size_t size,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event * event_wait_list,
+                    cl_event * event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  /* OpenCL 1.2: pattern_size must be a power of two in {1, ..., 128}. */
+  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int i = 0;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  /* offset is unsigned (size_t), so "offset < 0" can never be true;
+     only the upper bound needs checking. Also guard against wrap-around
+     of offset + size. */
+  if (offset + size < offset || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (pattern == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
+    if (valid_sz[i] == pattern_size)
+      break;
+  }
+  if (i == sizeof(valid_sz) / sizeof(size_t)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (offset % pattern_size || size % pattern_size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Validate the wait list *before* executing the fill, so an invalid
+     event list cannot leave the buffer partially written. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+  if (err) {
+    goto error;
+  }
+
+  data = &no_wait_data;
+  data->type = EnqueueFillBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
+
+  /* Propagate any failure from cl_command_queue_flush instead of
+     unconditionally returning 0. */
+  return err;
+
+  error:
+  return err;
+}
+
+cl_int
clEnqueueCopyBuffer(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
LOCAL void
cl_context_delete(cl_context ctx)
{
+ int i = 0;
if (UNLIKELY(ctx == NULL))
return;
if (atomic_dec(&ctx->ref_n) > 1)
return;
+ /* delete the internal programs. */
+ for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+ if (ctx->internel_kernels[i]) {
+ cl_kernel_delete(ctx->internel_kernels[i]);
+ ctx->internel_kernels[i] = NULL;
+
+ assert(ctx->internal_prgs[i]);
+ cl_program_delete(ctx->internal_prgs[i]);
+ ctx->internal_prgs[i] = NULL;
+ }
+ }
+
/* All object lists should have been freed. Otherwise, the reference counter
* of the context cannot be 0
*/
cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
{
cl_int ret;
- if (!ctx->internal_prgs[index])
- {
+ if (!ctx->internal_prgs[index]) {
size_t length = strlen(str_kernel) + 1;
ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
ctx->internal_prgs[index]->is_built = 1;
- ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+ if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+ for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+ if (index != i) {
+ assert(ctx->internal_prgs[i] == NULL);
+ assert(ctx->internel_kernels[i] == NULL);
+ cl_program_add_ref(ctx->internal_prgs[index]);
+ ctx->internal_prgs[i] = ctx->internal_prgs[index];
+ }
+
+ if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_2", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_4", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_8", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_16", NULL);
+ } else
+ assert(0);
+ }
+ } else {
+ ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ }
}
return ctx->internel_kernels[index];
{
cl_int ret;
cl_int binary_status = CL_SUCCESS;
- if (!ctx->internal_prgs[index])
- {
+ if (!ctx->internal_prgs[index]) {
ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
&size, (const unsigned char **)&str_kernel, &binary_status, &ret);
ctx->internal_prgs[index]->is_built = 1;
- ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+ if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+ for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+ if (index != i) {
+ assert(ctx->internal_prgs[i] == NULL);
+ assert(ctx->internel_kernels[i] == NULL);
+ cl_program_add_ref(ctx->internal_prgs[index]);
+ ctx->internal_prgs[i] = ctx->internal_prgs[index];
+ }
+
+ if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_2", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_4", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_8", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_16", NULL);
+ } else
+ assert(0);
+ }
+ } else {
+ ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ }
}
return ctx->internel_kernels[index];
};
enum _cl_internal_ker_type {
+ CL_INTERNAL_KERNEL_MIN = 0, /* first internal kernel index, used for iteration */
CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
CL_ENQUEUE_COPY_BUFFER_ALIGN16,
CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer
CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d
CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d
+ CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1-byte aligned pattern, pattern size=1
+ CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2-byte aligned pattern, pattern size=2
+ CL_ENQUEUE_FILL_BUFFER_ALIGN4, //fill buffer with 4-byte aligned pattern, pattern size=4
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, //fill buffer with 8-byte aligned pattern, pattern size=8
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, //fill buffer with 8-byte aligned pattern, pattern size=16
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, //fill buffer with 8-byte aligned pattern, pattern size=32
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, //fill buffer with 8-byte aligned pattern, pattern size=64
+ CL_ENQUEUE_FILL_BUFFER_ALIGN128, //fill buffer with 128-byte aligned pattern, pattern size=128
CL_INTERNAL_KERNEL_MAX
};
case EnqueueCopyBufferToImage:
case EnqueueCopyImageToBuffer:
case EnqueueNDRangeKernel:
+ case EnqueueFillBuffer:
cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
return CL_SUCCESS;
case EnqueueNativeKernel:
EnqueueNDRangeKernel,
EnqueueNativeKernel,
EnqueueMarker,
+ EnqueueFillBuffer,
EnqueueInvalid
} enqueue_type;
{
switch(type) {
case CL_COMMAND_COPY_BUFFER:
+ case CL_COMMAND_FILL_BUFFER:
case CL_COMMAND_COPY_IMAGE:
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
}
LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+            cl_mem buffer, size_t offset, size_t size)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  char pattern_comb[4];
+  int is_128 = 0;
+  const void * pattern1 = NULL;
+  cl_uint offset32, size32;
+
+  /* The caller (clEnqueueFillBuffer) guarantees these alignments. */
+  assert(offset % pattern_size == 0);
+  assert(size % pattern_size == 0);
+
+  if (!size)
+    return ret;
+
+  if (pattern_size == 128) {
+    /* A 128-byte pattern corresponds to double16, but double works
+       poorly on some platforms. Split the pattern into two 64-byte
+       halves (two float16 stores); the second half is passed as an
+       extra kernel argument below. */
+    extern char cl_internal_fill_buf_align128_str[];
+    extern int cl_internal_fill_buf_align128_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+             cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+    is_128 = 1;
+    pattern_size = pattern_size / 2;
+    /* Arithmetic on "void *" is a GNU extension; cast to char* first. */
+    pattern1 = (const char *)pattern + pattern_size;
+    size = size / 2;
+  } else if (pattern_size % 8 == 0) { /* Handle the 8/16/32/64 cases here. */
+    extern char cl_internal_fill_buf_align8_str[];
+    extern int cl_internal_fill_buf_align8_str_size;
+    /* pattern_size/8 is 1,2,4,8 -> order 0,1,2,3 selects the kernel. */
+    int order = ffs(pattern_size / 8) - 1;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+             cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+  } else if (pattern_size == 4) {
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+             cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+    /* Unaligned pattern, but size and offset are 4-aligned: duplicate
+       the 1- or 2-byte pattern into a 4-byte one and use the align4
+       kernel. */
+    assert(pattern_size == 1 || pattern_size == 2);
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    if (pattern_size == 2) {
+      memcpy(pattern_comb, pattern, sizeof(char)*2);
+      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+    } else {
+      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+        = pattern_comb[3] = *(char *)pattern;
+    }
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+             cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+    pattern_size = 4;
+    pattern = pattern_comb;
+  }
+  //TODO: Unaligned cases. We may need to optimize them like cl_mem_copy, using a mask
+  //in the kernel functions. For now we just use the aligned 1- and 2-byte kernels.
+  else if (pattern_size == 2) {
+    extern char cl_internal_fill_buf_align2_str[];
+    extern int cl_internal_fill_buf_align2_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+             cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+  } else if (pattern_size == 1) {
+    extern char cl_internal_fill_buf_unalign_str[];
+    extern int cl_internal_fill_buf_unalign_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+             cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+  } else
+    assert(0);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  /* Kernels work in units of pattern_size elements, not bytes. */
+  size = size / pattern_size;
+  offset = offset / pattern_size;
+
+  if (size < LOCAL_SZ_0) {
+    local_sz[0] = 1;
+  } else {
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+  /* The kernels take 32-bit offset/size arguments; pass genuine cl_uint
+     objects instead of reinterpreting size_t storage, which reads the
+     wrong bytes on big-endian hosts. */
+  offset32 = (cl_uint)offset;
+  size32 = (cl_uint)size;
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset32);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size32);
+  if (is_128)
+    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  return ret;
+}
+
+LOCAL cl_int
cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch,
extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
size_t src_offset, size_t dst_offset, size_t cb);
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+ cl_mem buffer, size_t offset, size_t size);
+
/* api clEnqueueCopyBufferRect help function */
extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
const size_t *, const size_t *, const size_t *,