HSW: Correct the scratch buffer size calc and set the correct index in vfe state.

author Yang Rong <rong.r.yang@intel.com>

Mon, 19 May 2014 05:52:25 +0000 (13:52 +0800)

committer Zhigang Gong <zhigang.gong@intel.com>

Mon, 19 May 2014 04:54:22 +0000 (12:54 +0800)
author Yang Rong <rong.r.yang@intel.com>
Mon, 19 May 2014 05:52:25 +0000 (13:52 +0800)
committer Zhigang Gong <zhigang.gong@intel.com>
Mon, 19 May 2014 04:54:22 +0000 (12:54 +0800)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp

index 6a0bca2..db968c3 100644 (file)
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -313,16 +313,6 @@ namespace gbe
      allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
    }
  
-  static int
-  alignScratchSize(int size){
-    int i = 0;
-
-    for(; i < size; i+=1024)
-      ;
-
-    return i;
-  }
-
    ///////////////////////////////////////////////////////////////////////////
    // Generic Context (shared by the simulator and the HW context)
    ///////////////////////////////////////////////////////////////////////////
@@ -355,7 +345,7 @@ namespace gbe
      GBE_SAFE_DELETE(this->scratchAllocator);
      GBE_ASSERT(dag != NULL && liveness != NULL);
      this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
-    this->scratchAllocator = GBE_NEW(ScratchAllocator, 12*KB);
+    this->scratchAllocator = GBE_NEW(ScratchAllocator, this->getScratchSize());
      this->curbeRegs.clear();
      this->JIPs.clear();
    }
@@ -375,7 +365,7 @@ namespace gbe
        this->kernel = NULL;
      }
      if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
+      this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
        this->kernel->ctx = this;
      }
      return this->kernel;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp

index d4dcfca..2a37a0e 100644 (file)
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -103,6 +103,10 @@ namespace gbe
    protected:
      /*! Build the instruction stream. Return false if failed */
      virtual bool emitCode(void) = 0;
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t) = 0;
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) = 0;
      /*! Allocate a new empty kernel (to be implemented) */
      virtual Kernel *allocateKernel(void) = 0;
      /*! Look if a stack is needed and allocate it */
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp

index f22a6ab..aedd4d3 100644 (file)
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -54,6 +54,58 @@ namespace gbe
        allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
    }
  
+  uint32_t Gen75Context::alignScratchSize(uint32_t size){
+    // Round up to a power of two, never below 2KB; a zero request stays zero.
+    uint32_t aligned = 0;
+    if (size != 0)
+      for (aligned = 2048; aligned < size; aligned *= 2) {}
+    return aligned;
+  }
+
+  /*! Emit the per-lane stack pointer computation for Gen7.5.
+   *  The emitted sequence computes
+   *    stackptr = (stackptr << log2(perLaneSize)) + stackbuffer
+   *               + (threadIndex << log2(perLaneSize * simdWidth))
+   *  with threadIndex = ((x & 0x7f) << 1) + ((x & 0x80) >> 7), where x is the
+   *  value read through GenRegister::ud1grf(0,5).
+   *  NOTE(review): ud1grf(0,5) presumably addresses r0.5 and carries the
+   *  hardware thread id bits - confirm against the register layout.
+   *  Nothing is emitted when the kernel has no stack pointer curbe entry.
+   */
+  void Gen75Context::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+    GBE_ASSERT(perLaneSize > 0);
+    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+    // Use shifts rather than muls which are limited to 32x16 bit sources
+    const uint32_t perLaneShift = logi2(perLaneSize);
+    const uint32_t perThreadShift = logi2(perThreadSize);
+    // Pick the 8- or 16-wide view of the stack pointer to match the SIMD width
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+    // We compute the per-lane stack pointer here
+    p->push();
+      // Scalar, unpredicated work on the two temporaries ud1grf(126,0) / ud1grf(126,4)
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      // Split the source value: low 7 bits go to (126,0), bit 7 goes to (126,4)
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80));
+      p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+      // Full-width: scale each lane's stack pointer by the per-lane stack size
+      p->curr.execWidth = this->simdWidth;
+      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      // Scalar again: threadIndex = (low bits << 1) + bit 7, then scale by the per-thread size
+      p->curr.execWidth = 1;
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1));
+      p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      // Full-width: add the stack buffer base and this thread's offset to every lane
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, stackptr, bufferptr);
+      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+    p->pop();
+  }
+
    void Gen75Context::newSelection(void) {
      this->sel = GBE_NEW(Selection75, *this);
    }
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp

index 1debe7b..291e260 100644 (file)
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen75_context.hpp
@@ -34,7 +34,19 @@ namespace gbe
    public:
      virtual ~Gen75Context(void) { }
      Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
-            : GenContext(unit, name, deviceID, relaxMath) { }
+            : GenContext(unit, name, deviceID, relaxMath) {
+    };
+    /*! device's max scratch buffer size */
+    const int GEN75_SCRATCH_SIZE = 2 * KB * KB;
+    /*! Emit the per-lane stack pointer computation */
+    virtual void emitStackPointer(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size (currently clamped to 0x7fff bytes) */
+    virtual uint32_t getScratchSize(void) {
+      // The allocator uses uint16_t, so clamp the size for now; this needs refining.
+      return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+    }
  
    protected:
      virtual GenEncoder* generateEncoder(void) {
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp

index 367e48b..f4c80e3 100644 (file)
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -79,6 +79,12 @@ namespace gbe
      this->sel = GBE_NEW(Selection, *this);
    }
  
+  uint32_t GenContext::alignScratchSize(uint32_t size){
+    // Round up to the next multiple of 1KB (0 stays 0). Closed form instead of a
+    // stepping loop: constant time, and it cannot spin forever when size lies
+    // within 1KB of UINT32_MAX (there the result wraps to 0).
+    const uint32_t unit = 1024;
+    return (size + unit - 1) / unit * unit;
+  }
+
    void GenContext::emitInstructionStream(void) {
      // Emit Gen ISA
      for (auto &block : *sel->blockList)
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp

index 149d526..4a08ed7 100644 (file)
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -64,12 +64,18 @@ namespace gbe
                 bool relaxMath = false);
      /*! Release everything needed */
      virtual ~GenContext(void);
+    /*! device's max scratch buffer size */
+    const int GEN7_SCRATCH_SIZE = 12 * KB;
      /*! Start new code generation with specific parameters */
      void startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure);
      /*! Target device ID*/
      uint32_t deviceID;
      /*! Implements base class */
      virtual bool emitCode(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
      /*! Function we emit code for */
      INLINE const ir::Function &getFunction(void) const { return fn; }
      /*! Simd width chosen for the current function */
@@ -78,7 +84,7 @@ namespace gbe
      /*! check the flag reg, if is grf, use f0.1 instead */
      GenRegister checkFlagRegister(GenRegister flagReg);
      /*! Emit the per-lane stack pointer computation */
-    void emitStackPointer(void);
+    virtual void emitStackPointer(void);
      /*! Emit the instructions */
      void emitInstructionStream(void);
      /*! Set the correct target values for the branches */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c

index 268789a..4dc74cd 100644 (file)
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -71,8 +71,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
  /* XXX we clone IVB for HSW now */
  static struct _cl_device_id intel_hsw_gt1_device = {
    INIT_ICD(dispatch)
-  .max_compute_unit = 64,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 70,
+  .max_thread_per_unit = 7,
    .max_work_item_sizes = {512, 512, 512},
    .max_work_group_size = 512,
    .max_clock_frequency = 1000,
@@ -82,8 +82,8 @@ static struct _cl_device_id intel_hsw_gt1_device = {
  
  static struct _cl_device_id intel_hsw_gt2_device = {
    INIT_ICD(dispatch)
-  .max_compute_unit = 128,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 140,
+  .max_thread_per_unit = 7,
    .max_work_item_sizes = {512, 512, 512},
    .max_work_group_size = 512,
    .max_clock_frequency = 1000,
@@ -93,8 +93,8 @@ static struct _cl_device_id intel_hsw_gt2_device = {
  
  static struct _cl_device_id intel_hsw_gt3_device = {
    INIT_ICD(dispatch)
-  .max_compute_unit = 256,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 280,
+  .max_thread_per_unit = 7,
    .max_work_item_sizes = {512, 512, 512},
    .max_work_group_size = 512,
    .max_clock_frequency = 1000,
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c

index 6a640b8..b7b712f 100644 (file)
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -121,6 +121,9 @@ typedef struct intel_gpgpu intel_gpgpu_t;
  typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
  intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
  
+typedef uint32_t (get_scratch_index_t)(uint32_t size);
+get_scratch_index_t *get_scratch_index = NULL;
+
  static void
  intel_gpgpu_sync(void *buf)
  {
@@ -230,17 +233,34 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
    ADVANCE_BATCH(gpgpu->batch);
  }
  
+uint32_t get_scratch_index_gen7(uint32_t size) {  /* index = number of 1KB units - 1 */
+  return (size >> 10) - 1;
+}
+
+/* Gen7.5 scratch index: 0 selects 2KB and each step doubles it (see the Haswell
+ * MEDIA_VFE_STATE per-thread scratch field). Sizes that are not a power of two
+ * are rounded up; anything up to 2KB, including 0, maps to index 0. */
+uint32_t get_scratch_index_gen75(uint32_t size) {
+    uint32_t index = 0;
+
+    if (size <= 2048)
+      return 0;
+    /* (size - 1) >> 11 is the number of whole 2KB units below size; the count
+     * of significant bits in it is ceil(log2(size / 2KB)). */
+    for (size = (size - 1) >> 11; size > 0; size >>= 1)
+      index++;
+    return index;
+}
+
  static void
  intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
  {
+  int32_t scratch_index;
    BEGIN_BATCH(gpgpu->batch, 8);
    OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
  
    if(gpgpu->per_thread_scratch > 0) {
+    scratch_index = get_scratch_index(gpgpu->per_thread_scratch);
      OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
                I915_GEM_DOMAIN_RENDER,
                I915_GEM_DOMAIN_RENDER,
-              gpgpu->per_thread_scratch/1024 - 1);
+              scratch_index);
    }
    else {
      OUT_BATCH(gpgpu->batch, 0);
@@ -359,9 +379,6 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
      OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
      ADVANCE_BATCH(gpgpu->batch);
  
-  //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec
-  if(use_slm)
-    gpgpu->batch->enable_slm = 1;
    intel_gpgpu_pipe_control(gpgpu);
  }
  
@@ -1158,10 +1175,12 @@ intel_set_gpgpu_callbacks(int device_id)
    if (IS_HASWELL(device_id)) {
      cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
+    get_scratch_index = get_scratch_index_gen75;
    }
    else if (IS_IVYBRIDGE(device_id)) {
      cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
+    get_scratch_index = get_scratch_index_gen7;
    }
    else
      assert(0);
author	Yang Rong <rong.r.yang@intel.com>
	Mon, 19 May 2014 05:52:25 +0000 (13:52 +0800)
committer	Zhigang Gong <zhigang.gong@intel.com>
	Mon, 19 May 2014 04:54:22 +0000 (12:54 +0800)
backend/src/backend/context.cpp		patch \| blob \| history
backend/src/backend/context.hpp		patch \| blob \| history
backend/src/backend/gen75_context.cpp		patch \| blob \| history
backend/src/backend/gen75_context.hpp		patch \| blob \| history
backend/src/backend/gen_context.cpp		patch \| blob \| history
backend/src/backend/gen_context.hpp		patch \| blob \| history
src/cl_device_id.c		patch \| blob \| history
src/intel/intel_gpgpu.c		patch \| blob \| history