Implement the clEnqueueFillBuffer API.
author Junyan He <junyan.he@linux.intel.com>
Fri, 13 Jun 2014 05:30:42 +0000 (13:30 +0800)
committer Zhigang Gong <zhigang.gong@intel.com>
Fri, 13 Jun 2014 05:50:39 +0000 (13:50 +0800)
We use floatn assignment to do the copy.
The 128-byte pattern size corresponds to double16, and because of
the double problem on our platform, we use two float16 values
to handle this.
The unaligned cases are not optimized yet; they just use char
assignment.

Signed-off-by: Junyan He <junyan.he@linux.intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
src/cl_api.c
src/cl_context.c
src/cl_context.h
src/cl_enqueue.c
src/cl_enqueue.h
src/cl_event.c
src/cl_mem.c
src/cl_mem.h

index c8d3cee..0e28f6b 100644 (file)
@@ -1602,6 +1602,84 @@ error:
 }
 
+/* Enqueue a command that fills `size` bytes of `buffer`, starting at byte
+ * `offset`, with repeated copies of the `pattern_size`-byte `pattern`.
+ * Returns CL_INVALID_CONTEXT when the queue and the buffer use different
+ * contexts, and CL_INVALID_VALUE when the region leaves the buffer, `pattern`
+ * is NULL, `pattern_size` is not a power of two in [1, 128], or offset/size
+ * are not multiples of it.  A cl_mem_fill or flush failure is returned as is. */
 cl_int
+clEnqueueFillBuffer(cl_command_queue   command_queue,
+                    cl_mem             buffer,
+                    const void *       pattern,
+                    size_t             pattern_size,
+                    size_t             offset,
+                    size_t             size,
+                    cl_uint            num_events_in_wait_list,
+                    const cl_event *   event_wait_list,
+                    cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  /* size_t is unsigned: test the bounds without forming offset + size,
+     which could wrap around and slip past the check. */
+  if (offset > buffer->size || size > buffer->size - offset) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Valid pattern sizes are 1, 2, 4, ..., 128: the powers of two up to 128. */
+  if (pattern == NULL || pattern_size == 0 || pattern_size > 128
+      || (pattern_size & (pattern_size - 1)) != 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (offset % pattern_size || size % pattern_size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    /* Record the submit time only for non-user events on profiling queues. */
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
+
+  /* The success path falls through as well, so the status of the flush is
+     reported to the caller instead of being dropped. */
+ error:
+  return err;
+}
+
+cl_int
 clEnqueueCopyBuffer(cl_command_queue     command_queue,
                     cl_mem               src_buffer,
                     cl_mem               dst_buffer,
index 1911bf2..a0da7b0 100644 (file)
@@ -188,6 +188,7 @@ error:
 LOCAL void
 cl_context_delete(cl_context ctx)
 {
+  int i = 0;
   if (UNLIKELY(ctx == NULL))
     return;
 
@@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx)
   if (atomic_dec(&ctx->ref_n) > 1)
     return;
 
+  /* delete the internal programs. */
+  for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+    if (ctx->internel_kernels[i]) {
+      cl_kernel_delete(ctx->internel_kernels[i]);
+      ctx->internel_kernels[i] = NULL;
+
+      assert(ctx->internal_prgs[i]);
+      cl_program_delete(ctx->internal_prgs[i]);
+      ctx->internal_prgs[i] = NULL;
+    }
+  }
+
   /* All object lists should have been freed. Otherwise, the reference counter
    * of the context cannot be 0
    */
@@ -250,8 +263,7 @@ cl_kernel
 cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
 {
   cl_int ret;
-  if (!ctx->internal_prgs[index])
-  {
+  if (!ctx->internal_prgs[index]) {
     size_t length = strlen(str_kernel) + 1;
     ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
 
@@ -264,7 +276,35 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
 
     ctx->internal_prgs[index]->is_built = 1;
 
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx use the same program, different kernel. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
   }
 
   return ctx->internel_kernels[index];
@@ -276,8 +316,7 @@ cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
 {
   cl_int ret;
   cl_int binary_status = CL_SUCCESS;
-  if (!ctx->internal_prgs[index])
-  {
+  if (!ctx->internal_prgs[index]) {
     ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
       &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
 
@@ -290,7 +329,35 @@ cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
 
     ctx->internal_prgs[index]->is_built = 1;
 
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx use the same program, different kernel. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
   }
 
   return ctx->internel_kernels[index];
index 82d3217..65b1728 100644 (file)
@@ -40,6 +40,7 @@ enum _cl_gl_context_type {
 };
 
 enum _cl_internal_ker_type {
+  CL_INTERNAL_KERNEL_MIN = 0,
   CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
   CL_ENQUEUE_COPY_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
@@ -54,6 +55,14 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
+  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
+  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
+  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
+  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
   CL_INTERNAL_KERNEL_MAX
 };
 
index 800668d..4ec26c0 100644 (file)
@@ -421,6 +421,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
     case EnqueueCopyBufferToImage:
     case EnqueueCopyImageToBuffer:
     case EnqueueNDRangeKernel:
+    case EnqueueFillBuffer:
       cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
       return CL_SUCCESS;
     case EnqueueNativeKernel:
index c7e33da..99c75ec 100644 (file)
@@ -41,6 +41,7 @@ typedef enum {
   EnqueueNDRangeKernel,
   EnqueueNativeKernel,
   EnqueueMarker,
+  EnqueueFillBuffer,
   EnqueueInvalid
 } enqueue_type;
 
index bcd4a6f..2325675 100644 (file)
@@ -33,6 +33,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
 {
   switch(type) {
     case CL_COMMAND_COPY_BUFFER:
+    case CL_COMMAND_FILL_BUFFER:
     case CL_COMMAND_COPY_IMAGE:
     case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
     case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
index 2f03935..f0fd372 100644 (file)
@@ -938,6 +938,108 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
 }
 
+/* Fill `size` bytes of `buffer`, starting at byte `offset`, with copies of the
+ * `pattern_size`-byte `pattern` by launching one of the internal fill kernels.
+ * `offset` and `size` must be multiples of `pattern_size` (asserted).  Returns
+ * CL_SUCCESS at once when `size` is 0, CL_OUT_OF_RESOURCES when no kernel
+ * could be obtained, otherwise the status of cl_command_queue_ND_range. */
 LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+            cl_mem buffer, size_t offset, size_t size)
+{
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  char pattern_comb[4];
+  const void * pattern1 = NULL; /* second half of a 128-byte pattern, else NULL */
+
+  assert(offset % pattern_size == 0);
+  assert(size % pattern_size == 0);
+
+  if (!size)
+    return CL_SUCCESS;
+
+  if (pattern_size == 128) {
+    /* 128 is according to pattern of double16, but double works not very
+       well on some platform. We use two float16 to handle this. */
+    extern char cl_internal_fill_buf_align128_str[];
+    extern int cl_internal_fill_buf_align128_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+    pattern_size = pattern_size / 2;
+    pattern1 = (const char *)pattern + pattern_size; /* no void * arithmetic */
+    size = size / 2;
+  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+    extern char cl_internal_fill_buf_align8_str[];
+    extern int cl_internal_fill_buf_align8_str_size;
+    int order = ffs(pattern_size / 8) - 1;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+  } else if (pattern_size == 4) {
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+    /* The unaligned case. But if copy size and offset are aligned to 4, we can fake
+       the pattern with the pattern duplication fill in. */
+    assert(pattern_size == 1 || pattern_size == 2);
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    if (pattern_size == 2) {
+      memcpy(pattern_comb, pattern, sizeof(char)*2);
+      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+    } else {
+      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+        = pattern_comb[3] = *(const char *)pattern;
+    }
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+    pattern_size = 4;
+    pattern = pattern_comb;
+  }
+  //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel
+  //functions. This depend on the usage but now we just use aligned 1 and 2.
+  else if (pattern_size == 2) {
+    extern char cl_internal_fill_buf_align2_str[];
+    extern int cl_internal_fill_buf_align2_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+               cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+  } else if (pattern_size == 1) {
+    extern char cl_internal_fill_buf_unalign_str[];
+    extern int cl_internal_fill_buf_unalign_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+               cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+  } else
+    assert(0);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  size = size / pattern_size;     /* from here on counted in patterns */
+  offset = offset / pattern_size;
+  /* Args 2 and 3 are set as cl_uint-sized values: pass real cl_uint objects
+     rather than the address of a size_t, which only works on little-endian. */
+  const cl_uint elem_off = (cl_uint)offset;
+  const cl_uint elem_cnt = (cl_uint)size;
+  local_sz[0] = (size < LOCAL_SZ_0) ? 1 : LOCAL_SZ_0;
+  global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &elem_off);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &elem_cnt);
+  if (pattern1) /* only the 128-byte path sets it */
+    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+  return cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+}
+
+LOCAL cl_int
 cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
                        const size_t *src_origin, const size_t *dst_origin, const size_t *region,
                        size_t src_row_pitch, size_t src_slice_pitch,
index 5719c60..c0f902e 100644 (file)
@@ -201,6 +201,9 @@ extern void cl_mem_add_ref(cl_mem);
 extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
               size_t src_offset, size_t dst_offset, size_t cb);
 
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+              cl_mem buffer, size_t offset, size_t size);
+
 /* api clEnqueueCopyBufferRect help function */
 extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,