Added one more test for the arrays
author Benjamin Segovia <segovia.benjamin@gmail.com>
Thu, 17 May 2012 15:31:32 +0000 (15:31 +0000)
committer Keith Packard <keithp@keithp.com>
Fri, 10 Aug 2012 23:17:38 +0000 (16:17 -0700)
kernels/compiler_array.cl
kernels/compiler_array0.cl [new file with mode: 0644]
src/cl_command_queue_gen7.c
src/cl_driver.c
src/cl_driver.h
src/intel/intel_gpgpu.c
src/sim/sim_driver.c
utests/CMakeLists.txt
utests/compiler_array0.cpp [new file with mode: 0644]

diff --git a/kernels/compiler_array.cl b/kernels/compiler_array.cl
index 7bd6060..5dce4d9 100644
@@ -12,4 +12,3 @@ compiler_array(__global int *src, __global int *dst)
   dst[get_global_id(0)] = array[get_local_id(0)];
 }
 
-
diff --git a/kernels/compiler_array0.cl b/kernels/compiler_array0.cl
new file mode 100644
index 0000000..3ab0fb8
--- /dev/null
@@ -0,0 +1,16 @@
+__kernel void
+compiler_array0(__global int *src, __global int *dst)
+{
+  int i;
+  int final[16];
+  for (i = 0; i < 16; ++i) {
+    int array[16], j;
+    for (j = 0; j < 16; ++j)
+      array[j] = get_global_id(0);
+    for (j = 0; j < src[0]; ++j)
+      array[j] = 1+src[j];
+    final[i] = array[i];
+  }
+  dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
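For reference, the nested loops in this kernel collapse to a simple per-work-item expression: every outer iteration rewrites array[] from scratch, so final[i] is 1+src[i] when i < src[0] and the global id otherwise. A minimal host-side sketch of the expected output (the helper name is ours, for illustration only; inputs are the 0..15 values the test below generates):

  /* What dst[gid] should hold after compiler_array0 runs. */
  static int compiler_array0_expected(int gid, const int *src)
  {
    return (gid < src[0]) ? 1 + src[gid] : gid;
  }
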
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 0d4ffb5..5da418c 100644
@@ -141,8 +141,6 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 {
   cl_context ctx = ker->program->ctx;
   cl_device_id device = ctx->device;
-  cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
-  cl_buffer buffer = NULL;
   const int32_t per_lane_stack_sz = gbe_kernel_get_stack_size(ker->opaque);
   const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
   const int32_t sub_value = GBE_STACK_BUFFER;
@@ -160,8 +158,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit;
   stack_sz *= device->max_thread_per_unit;
-  buffer = cl_buffer_alloc(bufmgr, NULL, stack_sz, 64);
-  cl_gpgpu_bind_buf(gpgpu, buffer, offset, cc_llc_l3);
+  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
 }
 
 LOCAL cl_int
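The size handed to the driver covers every lane that can be in flight at once: the per-lane stack reported by the compiler, times the SIMD width, times the compute units, times the threads per unit. A worked example with hypothetical figures (1 KB per-lane stack, SIMD16 kernel, a device reporting 16 compute units with 8 threads each; none of these numbers come from this commit):

  uint32_t stack_sz = 1024; /* gbe_kernel_get_stack_size(): bytes per lane */
  stack_sz *= 16;           /* gbe_kernel_get_simd_width(): lanes per HW thread */
  stack_sz *= 16;           /* device->max_compute_unit */
  stack_sz *= 8;            /* device->max_thread_per_unit */
  /* => 2 MB passed to cl_gpgpu_set_stack() */
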
diff --git a/src/cl_driver.c b/src/cl_driver.c
index 9c8bfa2..66d805d 100644
@@ -45,6 +45,7 @@ LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
 LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
 LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
+LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 75df8dd..1caf055 100644
@@ -23,7 +23,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-/* XXX needed for previous driver */
+/* Various limitations we should eventually remove */
 #define GEN_MAX_SURFACES 128
 #define GEN_MAX_SAMPLERS 16
 
@@ -118,6 +118,10 @@ typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state,
                                         cl_gpgpu_tiling tiling);
 extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D;
 
+/* Setup a stack */
+typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
+extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
+
 /* Configure internal state */
 typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
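The new entry point follows the same driver-callback pattern as the surrounding declarations: the header publishes a typedef and a function pointer, cl_driver.c defines the pointer as NULL, each backend installs its implementation at setup, and generic code only ever calls through the pointer. Condensed from this diff (no new API):

  /* cl_driver.h: the contract */
  typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset,
                                       uint32_t size, uint32_t cchint);
  extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;

  /* backend setup (intel_set_gpgpu_callbacks / sim_setup_callbacks) */
  cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;

  /* generic code (cl_bind_stack in cl_command_queue_gen7.c) */
  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
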
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index dd284e0..4e42afb 100644
@@ -63,6 +63,7 @@ struct intel_gpgpu
   uint32_t binded_offset[max_buf_n];    /* their offsets in the constant buffer */
   uint32_t binded_n;                    /* number of buffers binded */
 
+  struct { drm_intel_bo *bo; } stack_b;
   struct { drm_intel_bo *bo; } idrt_b;
   struct { drm_intel_bo *bo; } surface_heap_b;
   struct { drm_intel_bo *bo; } vfe_state_b;
@@ -81,24 +82,26 @@ struct intel_gpgpu
 typedef struct intel_gpgpu intel_gpgpu_t;
 
 static void
-intel_gpgpu_delete(intel_gpgpu_t *state)
+intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
 {
-  if (state == NULL)
+  if (gpgpu == NULL)
     return;
-  if (state->surface_heap_b.bo)
-    drm_intel_bo_unreference(state->surface_heap_b.bo);
-  if (state->idrt_b.bo)
-    drm_intel_bo_unreference(state->idrt_b.bo);
-  if (state->vfe_state_b.bo)
-    drm_intel_bo_unreference(state->vfe_state_b.bo);
-  if (state->curbe_b.bo)
-    drm_intel_bo_unreference(state->curbe_b.bo);
-  if (state->sampler_state_b.bo)
-    drm_intel_bo_unreference(state->sampler_state_b.bo);
-  if (state->perf_b.bo)
-    drm_intel_bo_unreference(state->perf_b.bo);
-  intel_batchbuffer_delete(state->batch);
-  cl_free(state);
+  if (gpgpu->surface_heap_b.bo)
+    drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
+  if (gpgpu->idrt_b.bo)
+    drm_intel_bo_unreference(gpgpu->idrt_b.bo);
+  if (gpgpu->vfe_state_b.bo)
+    drm_intel_bo_unreference(gpgpu->vfe_state_b.bo);
+  if (gpgpu->curbe_b.bo)
+    drm_intel_bo_unreference(gpgpu->curbe_b.bo);
+  if (gpgpu->sampler_state_b.bo)
+    drm_intel_bo_unreference(gpgpu->sampler_state_b.bo);
+  if (gpgpu->perf_b.bo)
+    drm_intel_bo_unreference(gpgpu->perf_b.bo);
+  if (gpgpu->stack_b.bo)
+    drm_intel_bo_unreference(gpgpu->stack_b.bo);
+  intel_batchbuffer_delete(gpgpu->batch);
+  cl_free(gpgpu);
 }
 
 static intel_gpgpu_t*
@@ -121,98 +124,96 @@ error:
 }
 
 static void
-intel_gpgpu_select_pipeline(intel_gpgpu_t *state)
+intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
 {
-  BEGIN_BATCH(state->batch, 1);
-  OUT_BATCH(state->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
-  ADVANCE_BATCH(state->batch);
+  BEGIN_BATCH(gpgpu->batch, 1);
+  OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_set_base_address(intel_gpgpu_t *state)
+intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
 {
   const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
-  BEGIN_BATCH(state->batch, 10);
-  OUT_BATCH(state->batch, CMD_STATE_BASE_ADDRESS | 8);
+  BEGIN_BATCH(gpgpu->batch, 10);
+  OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
   /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
-  OUT_BATCH(state->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);    /* General State Base Addr   */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);    /* General State Base Addr   */
   /* 0, State Mem Obj CC */
   /* We use a state base address for the surface heap since IVB clamp the
    * binding table pointer at 11 bits. So, we cannot use pointers directly while
    * using the surface heap
    */
-  OUT_RELOC(state->batch, state->surface_heap_b.bo,
+  OUT_RELOC(gpgpu->batch, gpgpu->surface_heap_b.bo,
             I915_GEM_DOMAIN_INSTRUCTION,
             I915_GEM_DOMAIN_INSTRUCTION,
             0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);
-  OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
-  OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
-  OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
   /* If we output an AUB file, we limit the total size to 64MB */
 #if USE_FULSIM
-  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
-  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
-  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
-  OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
 #else
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
-  OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
 #endif /* USE_FULSIM */
-  ADVANCE_BATCH(state->batch);
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_load_vfe_state(intel_gpgpu_t *state)
+intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 {
-  BEGIN_BATCH(state->batch, 8);
-  OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
+  BEGIN_BATCH(gpgpu->batch, 8);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
   gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
-    intel_batchbuffer_alloc_space(state->batch,0);
+    intel_batchbuffer_alloc_space(gpgpu->batch,0);
 
   memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
   vfe->vfe1.gpgpu_mode = 1;
   vfe->vfe1.bypass_gateway_ctl = 1;
   vfe->vfe1.reset_gateway_timer = 1;
-  vfe->vfe1.max_threads = state->max_threads - 1;
+  vfe->vfe1.max_threads = gpgpu->max_threads - 1;
   vfe->vfe1.urb_entries = 64;
   vfe->vfe3.curbe_size = 480;
   vfe->vfe4.scoreboard_mask = 0;
-  //vfe->vfe3.urb_size = 13;
-  //vfe->vfe4.scoreboard_mask = (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000;
-  intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
-  ADVANCE_BATCH(state->batch);
+  intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t));
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *state)
+intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
 {
-  BEGIN_BATCH(state->batch, 4);
-  OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
-  OUT_BATCH(state->batch, 0);                     /* mbz */
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                     /* mbz */
 // XXX
 #if 1
-  OUT_BATCH(state->batch,
-            state->urb.size_cs_entry*
-            state->urb.num_cs_entries*32);
+  OUT_BATCH(gpgpu->batch,
+            gpgpu->urb.size_cs_entry*
+            gpgpu->urb.num_cs_entries*32);
 #else
-  OUT_BATCH(state->batch, 5120);
+  OUT_BATCH(gpgpu->batch, 5120);
 #endif
-  OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-  ADVANCE_BATCH(state->batch);
+  OUT_RELOC(gpgpu->batch, gpgpu->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_load_idrt(intel_gpgpu_t *state)
-{
-  BEGIN_BATCH(state->batch, 4);
-  OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
-  OUT_BATCH(state->batch, 0);                    /* mbz */
-  OUT_BATCH(state->batch, 1 << 5);
-  OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-  ADVANCE_BATCH(state->batch);
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                    /* mbz */
+  OUT_BATCH(gpgpu->batch, 1 << 5);
+  OUT_RELOC(gpgpu->batch, gpgpu->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static const uint32_t gpgpu_l3_config_reg1[] =
@@ -303,11 +304,11 @@ enum GFX3DSTATE_PIPELINED_SUBOPCODE
 };
 
 static void
-intel_gpgpu_pipe_control(intel_gpgpu_t *state)
+intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 {
-  BEGIN_BATCH(state->batch, SIZEOF32(gen6_pipe_control_t));
+  BEGIN_BATCH(gpgpu->batch, SIZEOF32(gen6_pipe_control_t));
   gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
-    intel_batchbuffer_alloc_space(state->batch, 0);
+    intel_batchbuffer_alloc_space(gpgpu->batch, 0);
   memset(pc, 0, sizeof(*pc));
   pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
   pc->dw0.instruction_subopcode = GFX3DSUBOP_3DCONTROL;
@@ -317,167 +318,169 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *state)
   pc->dw1.render_target_cache_flush_enable = 1;
   pc->dw1.cs_stall = 1;
   pc->dw1.dc_flush_enable = 1;
-  ADVANCE_BATCH(state->batch);
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier)
+intel_gpgpu_set_L3(intel_gpgpu_t *gpgpu, uint32_t use_barrier)
 {
-  BEGIN_BATCH(state->batch, 6);
-  OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
-  OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
+  BEGIN_BATCH(gpgpu->batch, 6);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
   if (use_barrier)
-    OUT_BATCH(state->batch, gpgpu_l3_config_reg1[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
   else
-    OUT_BATCH(state->batch, gpgpu_l3_config_reg1[4]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
 
-  OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
-  OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
   if (use_barrier)
-    OUT_BATCH(state->batch, gpgpu_l3_config_reg2[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
   else
-    OUT_BATCH(state->batch, gpgpu_l3_config_reg2[4]);
-  ADVANCE_BATCH(state->batch);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+  ADVANCE_BATCH(gpgpu->batch);
 
-  intel_gpgpu_pipe_control(state);
+  intel_gpgpu_pipe_control(gpgpu);
 }
 
 static void
-intel_gpgpu_batch_start(intel_gpgpu_t *state)
-{
-  intel_batchbuffer_start_atomic(state->batch, 256);
-  intel_gpgpu_pipe_control(state);
-  if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
-    intel_gpgpu_set_L3(state, state->ker->use_barrier);
-  intel_gpgpu_select_pipeline(state);
-  intel_gpgpu_set_base_address(state);
-  intel_gpgpu_load_vfe_state(state);
-  intel_gpgpu_load_constant_buffer(state);
-  intel_gpgpu_load_idrt(state);
-
-  if (state->perf_b.bo) {
-    BEGIN_BATCH(state->batch, 3);
-    OUT_BATCH(state->batch,
+intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
+{
+  intel_batchbuffer_start_atomic(gpgpu->batch, 256);
+  intel_gpgpu_pipe_control(gpgpu);
+  intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_barrier);
+  intel_gpgpu_select_pipeline(gpgpu);
+  intel_gpgpu_set_base_address(gpgpu);
+  intel_gpgpu_load_vfe_state(gpgpu);
+  intel_gpgpu_load_constant_buffer(gpgpu);
+  intel_gpgpu_load_idrt(gpgpu);
+
+  if (gpgpu->perf_b.bo) {
+    BEGIN_BATCH(gpgpu->batch, 3);
+    OUT_BATCH(gpgpu->batch,
               (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
               (3 - 2));      /* length-2 */
-    OUT_RELOC(state->batch, state->perf_b.bo,
+    OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
               I915_GEM_DOMAIN_RENDER,
               I915_GEM_DOMAIN_RENDER,
               0 |  /* Offset for the start "counters" */
               1);  /* Use GTT and not PGTT */
-    OUT_BATCH(state->batch, 0);
-    ADVANCE_BATCH(state->batch);
+    OUT_BATCH(gpgpu->batch, 0);
+    ADVANCE_BATCH(gpgpu->batch);
   }
 }
 
 static void
-intel_gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode)
+intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
 {
   /* Insert the performance counter command */
-  if (state->perf_b.bo) {
-    BEGIN_BATCH(state->batch, 3);
-    OUT_BATCH(state->batch,
+  if (gpgpu->perf_b.bo) {
+    BEGIN_BATCH(gpgpu->batch, 3);
+    OUT_BATCH(gpgpu->batch,
               (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
               (3 - 2));      /* length-2 */
-    OUT_RELOC(state->batch, state->perf_b.bo,
+    OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
               I915_GEM_DOMAIN_RENDER,
               I915_GEM_DOMAIN_RENDER,
               512 |  /* Offset for the end "counters" */
               1);    /* Use GTT and not PGTT */
-    OUT_BATCH(state->batch, 0);
-    ADVANCE_BATCH(state->batch);
+    OUT_BATCH(gpgpu->batch, 0);
+    ADVANCE_BATCH(gpgpu->batch);
   }
 
-  if(flush_mode) intel_gpgpu_pipe_control(state);
-  intel_batchbuffer_end_atomic(state->batch);
+  if(flush_mode) intel_gpgpu_pipe_control(gpgpu);
+  intel_batchbuffer_end_atomic(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz)
+intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
 {
-  intel_batchbuffer_reset(state->batch, sz);
+  intel_batchbuffer_reset(gpgpu->batch, sz);
 }
 
 static void
-intel_gpgpu_flush(intel_gpgpu_t *state)
+intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
-  intel_batchbuffer_flush(state->batch);
+  intel_batchbuffer_flush(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_state_init(intel_gpgpu_t *state,
+intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
                        uint32_t max_threads,
                        uint32_t size_cs_entry)
 {
-  dri_bo *bo;
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bo *bo;
 
   /* Binded buffers */
-  state->binded_n = 0;
+  gpgpu->binded_n = 0;
 
   /* URB */
-  state->urb.num_cs_entries = 64;
-  state->urb.size_cs_entry = size_cs_entry;
-  state->max_threads = max_threads;
-
-  /* constant buffer */
-  if(state->curbe_b.bo)
-    dri_bo_unreference(state->curbe_b.bo);
-  uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * 64;
+  gpgpu->urb.num_cs_entries = 64;
+  gpgpu->urb.size_cs_entry = size_cs_entry;
+  gpgpu->max_threads = max_threads;
+
+  /* Constant buffer */
+  if(gpgpu->curbe_b.bo)
+    dri_bo_unreference(gpgpu->curbe_b.bo);
+  uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
   size_cb = ALIGN(size_cb, 4096);
-  bo = dri_bo_alloc(state->drv->bufmgr,
-                    "CONSTANT_BUFFER",
-                    size_cb,
-                    64);
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
   assert(bo);
-  state->curbe_b.bo = bo;
+  gpgpu->curbe_b.bo = bo;
 
   /* surface state */
-  if(state->surface_heap_b.bo)
-    dri_bo_unreference(state->surface_heap_b.bo);
-  bo = dri_bo_alloc(state->drv->bufmgr, 
+  if(gpgpu->surface_heap_b.bo)
+    dri_bo_unreference(gpgpu->surface_heap_b.bo);
+  bo = dri_bo_alloc(bufmgr, 
                     "SURFACE_HEAP",
                     sizeof(surface_heap_t),
                     32);
   assert(bo);
   dri_bo_map(bo, 1);
   memset(bo->virtual, 0, sizeof(surface_heap_t));
-  state->surface_heap_b.bo = bo;
+  gpgpu->surface_heap_b.bo = bo;
 
   /* Interface descriptor remap table */
-  if(state->idrt_b.bo)
-    dri_bo_unreference(state->idrt_b.bo);
-  bo = dri_bo_alloc(state->drv->bufmgr, 
+  if(gpgpu->idrt_b.bo)
+    dri_bo_unreference(gpgpu->idrt_b.bo);
+  bo = dri_bo_alloc(bufmgr, 
                     "IDRT",
                     MAX_IF_DESC * sizeof(struct gen6_interface_descriptor),
                     32);
   assert(bo);
-  state->idrt_b.bo = bo;
+  gpgpu->idrt_b.bo = bo;
 
   /* vfe state */
-  if(state->vfe_state_b.bo)
-    dri_bo_unreference(state->vfe_state_b.bo);
-  state->vfe_state_b.bo = NULL;
+  if(gpgpu->vfe_state_b.bo)
+    dri_bo_unreference(gpgpu->vfe_state_b.bo);
+  gpgpu->vfe_state_b.bo = NULL;
 
   /* sampler state */
-  if (state->sampler_state_b.bo)
-    dri_bo_unreference(state->sampler_state_b.bo);
-  bo = dri_bo_alloc(state->drv->bufmgr, 
-                    "sample states",
+  if (gpgpu->sampler_state_b.bo)
+    dri_bo_unreference(gpgpu->sampler_state_b.bo);
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, 
+                    "SAMPLER_STATE",
                     GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
                     32);
   assert(bo);
   dri_bo_map(bo, 1);
   memset(bo->virtual, 0, sizeof(gen6_sampler_state_t) * GEN_MAX_SAMPLERS);
-  state->sampler_state_b.bo = bo;
+  gpgpu->sampler_state_b.bo = bo;
+
+  /* stack */
+  if (gpgpu->stack_b.bo)
+    dri_bo_unreference(gpgpu->stack_b.bo);
+  gpgpu->stack_b.bo = NULL;
 }
 
 static void
-intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo)
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo)
 {
-  surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
   heap->binding_table[index] = offsetof(surface_heap_t, surface) +
                                index * sizeof(gen7_surface_state_t);
-  dri_bo_emit_reloc(state->surface_heap_b.bo,
+  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
                     I915_GEM_DOMAIN_RENDER,
                     I915_GEM_DOMAIN_RENDER,
                     0,
@@ -491,9 +494,9 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_
  * surface but Fulsim complains
  */
 static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *state)
+intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
 {
-  surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
   gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
   gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
@@ -510,7 +513,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *state)
 }
 
 static void
-intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
+intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *gpgpu,
                               int32_t index,
                               dri_bo* obj_bo,
                               uint32_t format,
@@ -519,7 +522,7 @@ intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
                               int32_t pitch,
                               int32_t tiling)
 {
-  surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
   gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
   memset(ss, 0, sizeof(*ss));
   ss->ss0.surface_type = I965_SURFACE_2D;
@@ -536,7 +539,7 @@ intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
     ss->ss0.tiled_surface = 1;
     ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
   }
-  intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
+  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo);
 }
 
 static void
@@ -546,18 +549,18 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
   gpgpu->binded_offset[gpgpu->binded_n] = offset;
   gpgpu->binded_n++;
-#if 0
-  const uint32_t size = obj_bo->size;
-  assert(index < GEN_MAX_SURFACES);
-  if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
-    intel_gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint);
-  else
-    NOT_IMPLEMENTED;
-#endif
 }
 
 static void
-intel_gpgpu_bind_image2D(intel_gpgpu_t *state,
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, cchint);
+}
+
+static void
+intel_gpgpu_bind_image2D(intel_gpgpu_t *gpgpu,
                          int32_t index,
                          cl_buffer *obj_bo,
                          uint32_t format,
@@ -567,19 +570,16 @@ intel_gpgpu_bind_image2D(intel_gpgpu_t *state,
                          cl_gpgpu_tiling tiling)
 {
   assert(index < GEN_MAX_SURFACES);
-  if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
-    intel_gpgpu_bind_image2D_gen7(state, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling);
-  else
-    NOT_IMPLEMENTED;
+  intel_gpgpu_bind_image2D_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling);
 }
 
 static void
-intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
+intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gen6_interface_descriptor_t *desc;
   drm_intel_bo *bo = NULL, *ker_bo = NULL;
 
-  bo = state->idrt_b.bo;
+  bo = gpgpu->idrt_b.bo;
   dri_bo_map(bo, 1);
   assert(bo->virtual);
   desc = (gen6_interface_descriptor_t*) bo->virtual;
@@ -588,14 +588,14 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
   ker_bo = (drm_intel_bo *) kernel->bo;
   desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
   desc->desc1.single_program_flow = 1;
-  desc->desc2.sampler_state_pointer = state->sampler_state_b.bo->offset >> 5;
+  desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
   desc->desc3.binding_table_entry_count = 0; /* no prefetch */
   desc->desc3.binding_table_pointer = 0;
   desc->desc4.curbe_read_len = kernel->cst_sz / 32;
   desc->desc4.curbe_read_offset = 0;
 
   /* Barriers / SLM are automatically handled on Gen7+ */
-  if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) {
+  if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) {
     size_t slm_sz = kernel->slm_sz;
     desc->desc5.group_threads_num = kernel->use_barrier ? kernel->thread_n : 0;
     desc->desc5.barrier_enable = kernel->use_barrier;
@@ -627,7 +627,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
                     I915_GEM_DOMAIN_INSTRUCTION, 0,
                     0,
                     offsetof(gen6_interface_descriptor_t, desc2),
-                    state->sampler_state_b.bo);
+                    gpgpu->sampler_state_b.bo);
   dri_bo_unmap(bo);
 }
 
@@ -730,6 +730,7 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
   cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) intel_gpgpu_bind_image2D;
   cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
+  cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
   cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
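On the Intel side the new stack buffer gets the same lifecycle as the other per-state buffer objects: intel_gpgpu_state_init drops and clears any previous stack_b.bo, intel_gpgpu_set_stack allocates a fresh one and binds it like any other buffer, and intel_gpgpu_delete releases the last reference. A condensed sketch of one cycle, assuming exactly one set_stack call per state init (which is what the gen7 command queue above does):

  intel_gpgpu_state_init(gpgpu, max_threads, cs_sz); /* unref + NULL stack_b.bo */
  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc);   /* drm_intel_bo_alloc + bind_buf */
  /* ... build and flush the batch ... */
  intel_gpgpu_delete(gpgpu);                         /* final drm_intel_bo_unreference */
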
diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c
index 0d18ccd..49a9e89 100644
@@ -196,6 +196,7 @@ struct _sim_gpgpu
   sim_driver driver;                 /* the driver the gpgpu states belongs to */
   sim_kernel_cb *kernel;             /* call it for each HW thread */
   sim_buffer binded_buf[max_buf_n];  /* all buffers binded for the call */
+  sim_buffer stack;                  /* used only when stack is required */
   char *fake_memory;                 /* fake memory to emulate flat address space in any mode (32 / 64 bits) */
   char *curbe;                       /* constant buffer */
   uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
@@ -210,6 +211,7 @@ typedef struct _sim_gpgpu *sim_gpgpu;
 static void sim_gpgpu_delete(sim_gpgpu gpgpu) {
   if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory);
   if (gpgpu->curbe) cl_free(gpgpu->curbe);
+  if (gpgpu->stack) sim_buffer_delete(gpgpu->stack);
   cl_free(gpgpu);
 }
 
@@ -217,6 +219,7 @@ static sim_gpgpu sim_gpgpu_new(sim_driver driver)
 {
   sim_gpgpu gpgpu = NULL;
   TRY_ALLOC_NO_ERR(gpgpu, cl_calloc(1, sizeof(struct _sim_gpgpu)));
+  gpgpu->driver = driver;
 exit:
   return gpgpu;
 error:
@@ -264,7 +267,12 @@ static void
 sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry)
 {
   assert(gpgpu);
-  memset(gpgpu, 0, sizeof(*gpgpu));
+  if (gpgpu->stack)
+    sim_buffer_delete(gpgpu->stack);
+  gpgpu->fake_memory = NULL;
+  gpgpu->curbe = NULL;
+  gpgpu->binded_n = 0;
+  gpgpu->thread_n = 0;
   gpgpu->curbe_sz = size_cs_entry * 32;
   gpgpu->max_threads = max_threads;
 }
@@ -306,6 +314,14 @@ sim_gpgpu_bind_buf(sim_gpgpu gpgpu, sim_buffer buf, uint32_t offset, uint32_t cc
 }
 
 static void
+sim_gpgpu_set_stack(sim_gpgpu gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+{
+  sim_bufmgr bufmgr = gpgpu->driver->bufmgr;
+  gpgpu->stack = sim_buffer_alloc(bufmgr, "STACK", size, 64);
+  sim_gpgpu_bind_buf(gpgpu, gpgpu->stack, offset, cchint);
+}
+
+static void
 sim_gpgpu_walker(sim_gpgpu gpgpu,
                  uint32_t simd_sz,
                  uint32_t thread_n,
@@ -364,6 +380,7 @@ sim_setup_callbacks(void)
   cl_gpgpu_delete = (cl_gpgpu_delete_cb *) sim_gpgpu_delete;
   cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) sim_gpgpu_bind_image2D;
   cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) sim_gpgpu_bind_buf;
+  cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) sim_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) sim_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) sim_gpgpu_set_perf_counters;
   cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) sim_gpgpu_upload_constants;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 507ef0d..0fd5e82 100644
@@ -28,7 +28,8 @@ ADD_LIBRARY(utests SHARED
             compiler_lower_return0.cpp
             compiler_lower_return1.cpp
             compiler_lower_return2.cpp
-            compiler_array.cpp)
+            compiler_array.cpp
+            compiler_array0.cpp)
 TARGET_LINK_LIBRARIES(utests cl m)
 
 ADD_EXECUTABLE(run utest_run.cpp)
diff --git a/utests/compiler_array0.cpp b/utests/compiler_array0.cpp
new file mode 100644
index 0000000..9e3535d
--- /dev/null
@@ -0,0 +1,73 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  int i;
+  int final[16];
+  for (i = 0; i < 16; ++i) {
+    int array[16], j;
+    for (j = 0; j < 16; ++j)
+      array[j] = global_id;
+    for (j = 0; j < src[0]; ++j)
+      array[j] = 1+src[j];
+    final[i] = array[i];
+  }
+  dst[global_id] = final[global_id];
+}
+
+void compiler_array0(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array0");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array0);
+
+