Currently, we simply allocate enough graphics memory to serve as the constant
memory space and bind it to bti 2. Constant cache reads are backed by dword
scatter reads. Unlike other data port messages, the addresses need to be
dword aligned and are expressed in units of dwords.
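For reference, a minimal sketch (not part of the patch) of the byte-to-dword
address conversion this implies, the same shift-right-by-2 that
emitDWordGather performs below; byte_to_dword_addr is only an illustrative
helper:

  #include <stdint.h>
  #include <assert.h>

  /* Illustrative helper only: the dword scattered read takes its offsets in
   * dword units, so the byte address computed for a constant load must be
   * 4-byte aligned and shifted right by 2 before the message is sent. */
  static inline uint32_t byte_to_dword_addr(uint32_t byte_addr)
  {
    assert((byte_addr & 3) == 0); /* address must be dword aligned */
    return byte_addr >> 2;        /* now in units of dwords */
  }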
Data in the constant address space is laid out in order: first the global
constant data, then the constant buffer kernel arguments.
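As a rough sketch of that layout (mirroring cl_upload_constant_buffer below;
constant_space_offset is a hypothetical helper used only for illustration):

  #include <stddef.h>

  #define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

  /* Hypothetical helper: global constant data comes first (dword aligned),
   * then each __constant pointer argument, each padded to a dword boundary.
   * If there is no global constant data, 4 bytes are reserved so that no
   * argument starts at address 0. */
  static size_t constant_space_offset(size_t global_const_size,
                                      const size_t *arg_sizes, int arg_index)
  {
    size_t offset = ALIGN(global_const_size, 4);
    if (global_const_size == 0)
      offset += 4;
    for (int i = 0; i < arg_index; ++i)
      offset += ALIGN(arg_sizes[i], 4);
    return offset;
  }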
v2: rename functions & variables to make the distinction between 'curbe' and 'constant buffer' clear
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
}
});
#undef INSERT_REG
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
- specialRegs.insert(ir::ocl::constoffst);
-
- // Insert serialized global constant arrays if used
- const ir::ConstantSet& constantSet = unit.getConstantSet();
- if (constantSet.getConstantNum()) {
- size_t size = constantSet.getDataSize();
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
- }
// Insert the number of threads
this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::constoffst)
+ reg == ir::ocl::workdim)
return true;
return false;
}
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
+ void emitDWordGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ GBE_ASSERT(valueNum == 1);
+ GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+ // Convert the byte address to a dword-based address (dword scatter reads take dword offsets)
+ GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+
+ sel.DWORD_GATHER(dst, addrDW, bti);
+ }
+
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- if (insn.getAddressSpace() == MEM_CONSTANT)
- this->emitIndirectMove(sel, insn, address);
+ if (insn.getAddressSpace() == MEM_CONSTANT) {
+ // XXX TODO: read 64-bit constants through the constant cache
+ // Per HW spec, constant cache messages read data at DWORD granularity at minimum,
+ // so byte/short data types have to be read through the data cache.
+ if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, 0x2);
+ else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitDWordGather(sel, insn, address, 0x2);
+ else {
+ const GenRegister value = sel.selReg(insn.getValue(0));
+ this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+ }
+ }
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
- allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
// Group and barrier IDs are always allocated by the hardware in r0
RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
- GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
- GBE_CURBE_GLOBAL_CONSTANT_DATA,
GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "const_curbe_offset",
"work_dimension",
};
DECL_NEW_REG(FAMILY_WORD, blockip);
DECL_NEW_REG(FAMILY_DWORD, barrierid);
DECL_NEW_REG(FAMILY_DWORD, threadn);
- DECL_NEW_REG(FAMILY_DWORD, constoffst);
DECL_NEW_REG(FAMILY_DWORD, workdim);
}
#undef DECL_NEW_REG
static const Register blockip = Register(19); // blockip
static const Register barrierid = Register(20);// barrierid
static const Register threadn = Register(21); // number of threads
- static const Register constoffst = Register(22); // offset of global constant array's curbe
- static const Register workdim = Register(23); // work dimention.
- static const uint32_t regNum = 24; // number of special registers
+ static const Register workdim = Register(22); // work dimension.
+ static const uint32_t regNum = 23; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
ir::Constant &con = unit.getConstantSet().getConstant(j ++);
con.setReg(reg.value());
- if(con.getOffset() != 0) {
- ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
- ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
- } else {
- ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
- }
+ ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
}
// Visit all the instructions and emit the IR registers or the value to
const ir::Type type = getType(ctx, elemType);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
+ if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
// Build the tuple data in the vector
return CL_SUCCESS;
}
-LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
- char * dst)
-{
- int i;
- for(i = 0; i < k->arg_n; i++) {
- enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-
- if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
- uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
- cl_mem mem = k->args[i].mem;
- cl_buffer_map(mem->bo, 1);
- void * addr = cl_buffer_get_virtual(mem->bo);
- memcpy(dst + offset, addr, mem->size);
- cl_buffer_unmap(mem->bo);
- }
- }
- return CL_SUCCESS;
-}
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
/* Bind all the image surfaces in the GPGPU state */
extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
-
-/*update constant buffer to final curbe */
-extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
#endif /* __CL_COMMAND_QUEUE_H__ */
block_ips[curr] = 0;
}
- /* Copy them to the constant buffer */
+ /* Copy them to the curbe buffer */
curr = 0;
for (i = 0; i < thread_n; ++i, data += cst_sz) {
uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
return err;
}
+static void
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+ /* calculate constant buffer size */
+ int32_t arg;
+ size_t offset;
+ gbe_program prog = ker->program->opaque;
+ const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
+ size_t global_const_size = gbe_program_get_global_constant_size(prog);
+ uint32_t constant_buf_size = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ cl_mem mem = ker->args[arg].mem;
+ constant_buf_size += ALIGN(mem->size, 4);
+ }
+ }
+ if(global_const_size == 0 && constant_buf_size == 0)
+ return;
+
+ cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+ cl_buffer_map(bo, 1);
+ char * cst_addr = cl_buffer_get_virtual(bo);
+ offset = 0;
+ if (global_const_size > 0) {
+ /* Write the global constant arrays */
+ gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+ }
+ offset += ALIGN(global_const_size, 4);
+
+ if(global_const_size == 0) {
+ /* reserve 4 bytes so that no constant ends up at address 0 */
+ offset += 4;
+ }
+
+ /* upload constant buffer argument */
+ int32_t curbe_offset = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ cl_mem mem = ker->args[arg].mem;
+
+ curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+ assert(curbe_offset >= 0);
+ *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+
+ cl_buffer_map(mem->bo, 1);
+ void * addr = cl_buffer_get_virtual(mem->bo);
+ memcpy(cst_addr + offset, addr, mem->size);
+ cl_buffer_unmap(mem->bo);
+ offset += ALIGN(mem->size, 4);
+ }
+ }
+ cl_buffer_unmap(bo);
+}
+
/* Will return the total amount of slm used */
static int32_t
cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
- UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
#undef UPLOAD
/* Write identity for the stack pointer. This is required by the stack pointer
int32_t i;
for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
}
-
- /* Write global constant arrays */
- if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
- /* Write the global constant arrays */
- gbe_program prog = ker->program->opaque;
- gbe_program_get_global_constant_data(prog, ker->curbe + offset);
- }
-
/* Handle the various offsets to SLM */
const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
int32_t arg, slm_offset = 0;
/* Compute the number of HW threads we need */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
- kernel.cst_sz = cst_sz;
+ kernel.curbe_sz = cst_sz;
- /* Curbe step 1: fill the constant buffer data shared by all threads */
+ /* Curbe step 1: fill the curbe (constant URB) buffer data shared by all threads */
if (ker->curbe) {
kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
cl_setup_scratch(gpgpu, ker);
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
+
+ cl_upload_constant_buffer(queue, ker);
+
cl_gpgpu_states_setup(gpgpu, &kernel);
/* Curbe step 2. Give the localID and upload it to video memory */
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
- cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
}
TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
- cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+ cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
}
/* Start a new batch buffer */
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
- uint32_t cst_sz; /* total size of all constants */
+ uint32_t curbe_sz; /* total size of all curbes */
cl_buffer bo; /* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_slm:1; /* For gen7 (automatic barrier management) */
typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
-/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
-extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+/* Fills current curbe buffer with data */
+typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
/* Setup all indirect states */
typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
-LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
.single_fp_config = 0, /* XXX */
.global_mem_cache_type = CL_READ_WRITE_CACHE,
.global_mem_size = 128 * 1024 * 1024,
-.max_constant_buffer_size = 64 << 10,
+.max_constant_buffer_size = 512 << 10,
.max_constant_args = 8,
.error_correction_support = CL_FALSE,
.host_unified_memory = CL_FALSE,
mem = *(cl_mem*) value;
- if(arg_type == GBE_ARG_CONSTANT_PTR) {
- int32_t cbOffset;
- cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
- //constant ptr's curbe offset changed, update it
- if(cbOffset >= 0) {
- offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- *((uint32_t *)(k->curbe + offset)) = cbOffset; //cb offset in curbe
- }
- }
-
cl_mem_add_ref(mem);
if (k->args[index].mem)
cl_mem_delete(k->args[index].mem);
/* We use the first 2 slots(0,1) for all the bufs.
* Notify the gbe this base index, thus gbe can avoid conflicts
* when it allocates slots for images*/
- gbe_set_image_base_index(2);
+ gbe_set_image_base_index(3);
exit:
return driver;
error:
intel_batchbuffer_t *batch;
cl_gpgpu_kernel *ker;
drm_intel_bo *binded_buf[max_buf_n]; /* all buffers binded for the call */
- uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
+ uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer */
uint32_t binded_n; /* number of buffers binded */
unsigned long img_bitmap; /* image usage bitmap. */
struct { drm_intel_bo *bo; } sampler_state_b;
struct { drm_intel_bo *bo; } perf_b;
struct { drm_intel_bo *bo; } scratch_b;
+ struct { drm_intel_bo *bo; } constant_b;
uint32_t per_thread_scratch;
struct {
if (gpgpu->scratch_b.bo)
drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+ if(gpgpu->constant_b.bo)
+ drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
intel_batchbuffer_delete(gpgpu->batch);
cl_free(gpgpu);
}
}
static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
intel_gpgpu_select_pipeline(gpgpu);
intel_gpgpu_set_base_address(gpgpu);
intel_gpgpu_load_vfe_state(gpgpu);
- intel_gpgpu_load_constant_buffer(gpgpu);
+ intel_gpgpu_load_curbe_buffer(gpgpu);
intel_gpgpu_load_idrt(gpgpu);
if (gpgpu->perf_b.bo) {
/* Binded buffers */
gpgpu->binded_n = 0;
gpgpu->img_bitmap = 0;
- gpgpu->img_index_base = 2;
+ gpgpu->img_index_base = 3;
gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
/* URB */
gpgpu->urb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
- /* Constant buffer */
+ /* Constant URB buffer */
if(gpgpu->curbe_b.bo)
dri_bo_unreference(gpgpu->curbe_b.bo);
uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
size_cb = ALIGN(size_cb, 4096);
- bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
assert(bo);
gpgpu->curbe_b.bo = bo;
obj_bo);
}
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+{
+ uint32_t s = size - 1;
+ assert(size != 0);
+
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+ memset(ss2, 0, sizeof(gen7_surface_state_t));
+ ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss2->ss5.cache_control = cc_llc_l3;
+ heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+
+ if(gpgpu->constant_b.bo)
+ dri_bo_unreference(gpgpu->constant_b.bo);
+ gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64);
+ assert(gpgpu->constant_b.bo);
+ ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+ dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0,
+ heap->binding_table[2] +
+ offsetof(gen7_surface_state_t, ss1),
+ gpgpu->constant_b.bo);
+ return gpgpu->constant_b.bo;
+}
+
+
/* Map address space with two 2GB surfaces. One surface for untyped message and
* one surface for byte scatters / gathers. Actually the HW does not require two
* surfaces but Fulsim complains
desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
- desc->desc4.curbe_read_len = kernel->cst_sz / 32;
+ desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
desc->desc4.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
}
static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
/* Now put all the relocations for our flat address space */
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
- *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
+ *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
- gpgpu->binded_offset[j]+i*k->cst_sz,
+ gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
0,
I915_GEM_DOMAIN_RENDER,
cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
- cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+ cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;