Add kernels performance output
authorYongjia Zhang <zhang_yong_jia@126.com>
Tue, 1 Apr 2014 09:16:46 +0000 (17:16 +0800)
committerZhigang Gong <zhigang.gong@intel.com>
Wed, 2 Apr 2014 02:54:40 +0000 (10:54 +0800)
if environment variable OCL_OUTPUT_KERNEL_PERF is set non-zero,
then after the executable program exits, beignet will output the
time information of each kernel executed.

v2:fixed the patch's trailing whitespace problem.

v3:if OCL_OUTPUT_KERNEL_PERF is 1, then the output will only
contains time summary, if it is 2, then the output will contain
time summary and detail. Add output 'Ave' and 'Dev', 'Ave' is
the average time per kernel per execution round, 'Dev' is the
result of 'Ave' divide a kernel's all executions' standard deviation.

Signed-off-by: Yongjia Zhang <yongjia.zhang@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
src/CMakeLists.txt
src/cl_api.c
src/cl_command_queue.c
src/performance.c [new file with mode: 0644]
src/performance.h [new file with mode: 0644]

index 95ff56f..4c34235 100644 (file)
@@ -47,7 +47,8 @@ set(OPENCL_SRC
     intel/intel_batchbuffer.c
     intel/intel_driver.c
     x11/dricommon.c 
-    x11/va_dri2.c)
+    x11/va_dri2.c
+    performance.c)
 
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
index 9638994..03a1cda 100644 (file)
@@ -40,6 +40,8 @@
 #include <assert.h>
 #include <unistd.h>
 
+#include "performance.h"
+
 #ifndef CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
@@ -293,6 +295,7 @@ clCreateContext(const cl_context_properties *  properties,
                            pfn_notify,
                            user_data,
                            &err);
+  initialize_env_var();
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -1676,6 +1679,10 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
 
     err = cl_command_queue_flush(command_queue);
   }
+
+  if(b_output_kernel_perf)
+         time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
+
   return 0;
 
 error:
@@ -1777,6 +1784,9 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
+
 error:
   return err;
 }
@@ -2016,6 +2026,9 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
+
 error:
   return err;
 }
@@ -2077,6 +2090,9 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
+
 error:
   return err;
 }
@@ -2138,6 +2154,9 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
+
 error:
   return err;
 }
@@ -2527,6 +2546,8 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
 error:
   return err;
 }
index 4ac2e11..6a699c0 100644 (file)
@@ -28,6 +28,7 @@
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "performance.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -376,6 +377,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
 {
+  if(b_output_kernel_perf)
+    time_start(queue->ctx, cl_kernel_get_name(k), queue);
   const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
   cl_int err = CL_SUCCESS;
 
diff --git a/src/performance.c b/src/performance.c
new file mode 100644 (file)
index 0000000..5248bb7
--- /dev/null
@@ -0,0 +1,277 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+
+typedef struct kernel_storage_node
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  int current_count;
+  float kernel_sum_time;
+  struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+  uint64_t context_id;
+  kernel_storage_node *kernels_storage;
+  char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_max_time;
+  int kernel_count;
+  struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+  context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+  if(NULL != prev_context_pointer )
+  {
+    if(prev_context_pointer->context_id == (uint64_t)context)
+      return prev_context_pointer;
+  }
+
+  if(NULL == record.context_storage)
+  {
+    record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+    record.context_storage->context_id = (uint64_t)context;
+    record.context_storage->kernels_storage = NULL;
+    record.context_storage->kernel_max_time = 0.0f;
+    record.context_storage->next = NULL;
+    record.context_storage->kernel_count = 0;
+    return record.context_storage;
+  }
+
+  context_storage_node *pre = record.context_storage;
+  context_storage_node *cur = record.context_storage;
+  while(NULL !=cur && (uint64_t)context != cur->context_id )
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+  pre = pre->next;
+  pre->context_id = (uint64_t)context;
+  pre->kernels_storage = NULL;
+  pre->kernel_max_time = 0.0f;
+  pre->next = NULL;
+  pre->kernel_count = 0;
+  return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
+{
+  if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+     p_context == prev_context_pointer &&
+     !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
+    return prev_kernel_pointer;
+
+  if(NULL == p_context)
+    return NULL;
+
+  if(NULL == p_context->kernels_storage)
+  {
+    p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+    p_context->kernel_count++;
+    strcpy(p_context->kernels_storage->kernel_name,kernel_name);
+    p_context->kernels_storage->current_count = 0;
+    p_context->kernels_storage->kernel_sum_time = 0.0f;
+    p_context->kernels_storage->next = NULL;
+    return p_context->kernels_storage;
+  }
+  kernel_storage_node *pre = p_context->kernels_storage;
+  kernel_storage_node *cur = p_context->kernels_storage;
+  while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+  {
+    return cur;
+  }
+  p_context->kernel_count++;
+  pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+  pre = pre->next;
+  pre->current_count = 0;
+  pre->kernel_sum_time = 0.0f;
+  pre->next = NULL;
+  strcpy(pre->kernel_name, kernel_name);
+  return pre;
+}
+
+static void free_storage()
+{
+  context_storage_node *p_context = record.context_storage;
+  while(NULL != p_context)
+  {
+    context_storage_node *p_tmp_context = p_context->next;
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    while(NULL != p_kernel)
+    {
+      kernel_storage_node *p_tmp_kernel = p_kernel->next;
+      free(p_kernel);
+      p_kernel = p_tmp_kernel;
+    }
+    free(p_context);
+    p_context = p_tmp_context;
+  }
+}
+
+typedef struct time_element
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_sum_time;
+  int kernel_execute_count;
+  double dev;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+  if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+    return 1;
+  else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+    return -1;
+  else
+    return 0;
+}
+
+static void print_time_info()
+{
+  context_storage_node *p_context = record.context_storage;
+  if(NULL == p_context)
+  {
+    printf("Nothing to output !\n");
+    return;
+  }
+
+  int tmp_context_id = 0;
+  while(NULL != p_context)
+  {
+    printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+    printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    kernel_storage_node *p_tmp_kernel = p_kernel;
+    time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+    int i = 0, j = 0;
+    while(NULL != p_tmp_kernel)
+    {
+      te[i].kernel_execute_count = p_tmp_kernel->current_count;
+      strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
+      te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
+      float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count;
+      float sumsquare = 0.0f;
+      for(j=0; j != p_tmp_kernel->current_count; ++j)
+        sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 );
+      te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count);
+      p_tmp_kernel = p_tmp_kernel->next;
+    }
+    float sum_time = 0.0f;
+    qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+    for(i=0; i<p_context->kernel_count; ++i)
+      sum_time += te[i].kernel_sum_time;
+    for(i=0; i<p_context->kernel_count; ++i)
+    {
+      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%] \n",
+             te[i].kernel_name,
+             te[i].kernel_sum_time / sum_time * 100,
+             te[i].kernel_sum_time,
+             te[i].kernel_execute_count,
+             te[i].kernel_sum_time / te[i].kernel_execute_count,
+             te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100);
+    }
+    free(te);
+    printf("    Total : %.2f\n", sum_time);
+    if(2 != b_output_kernel_perf)
+    {
+      p_context = p_context->next;
+      continue;
+    }
+    p_tmp_kernel = p_kernel;
+    printf("\n  ->>>> KERNELS TIME DETAIL <<<<-\n");
+    while(NULL != p_kernel)
+    {
+      printf("    [Kernel Name : %30s   Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+      for(i=0; i!=p_kernel->current_count; ++i)
+        printf("      Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
+      p_kernel = p_kernel->next;
+    }
+    printf("[------------  CONTEXT ENDS------------]\n\n");
+    p_context = p_context->next;
+  }
+  free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, float time)
+{
+  if(!atexit_registered)
+  {
+    atexit_registered = 1;
+    atexit(print_time_info);
+  }
+  context_storage_node *p_context = find_context(context);
+  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
+  prev_context_pointer = p_context;
+  prev_kernel_pointer = p_kernel;
+  p_kernel->kernel_times[p_kernel->current_count++] = time;
+  p_kernel->kernel_sum_time += time;
+  if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+  {
+    p_context->kernel_max_time = p_kernel->kernel_sum_time;
+    strcpy(p_context->max_time_kernel_name, kernel_name);
+  }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+  char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+  if(NULL == env || !strcmp(env,"0"))
+    b_output_kernel_perf = 0;
+  else if(!strcmp(env,"1"))
+    b_output_kernel_perf = 1;
+  else
+    b_output_kernel_perf = 2;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  pthread_mutex_lock(&mutex);
+  gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  clFinish(cq);
+  gettimeofday(&end, NULL);
+  float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+  insert(context, kernel_name, t);
+  pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644 (file)
index 0000000..c747743
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif