From 10924b4bb21d527e8f1ebb3392d7f8e44bee1415 Mon Sep 17 00:00:00 2001
From: Zhigang Gong <zhigang.gong@linux.intel.com>
Date: Wed, 25 Sep 2013 11:20:45 +0800
Subject: [PATCH] GBE: refactor the curbe register payload allocation.

As we already handle all the used curbe registers when we build
the patch list, we can easily create a set to store all the required
curbe registers, and then later, at the register allocation stage, we
can simply insert the registers in that set without any other checking.

This way, at the register allocation stage, we don't need to know anything
about those CURBE magic numbers. We only need to use the virtual register
as the key, naturally. This makes the code a little bit clearer.

Most importantly, this change is needed to support dynamic curbe register
allocation, for example for the image attributes. Each image may have 6 DWs,
and we may have many images but only access part of the images and part
of the image attributes. So we can't simply allocate a special
register for all the image attributes; we need to dynamically allocate
curbe registers on demand. The previous implementation does not
satisfy this requirement, so I have to make this change.

Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: "Yang, Rong R" <rong.r.yang@intel.com>
---
 backend/src/backend/context.cpp            | 37 ++++++------
 backend/src/backend/context.hpp            |  5 +-
 backend/src/backend/gen_reg_allocation.cpp | 93 +++++++++++-------------------
 3 files changed, 55 insertions(+), 80 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index a55ef04..2cacd07 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -366,7 +366,7 @@ namespace gbe
     this->kernel->stackSize = 1*KB; // XXX compute that in a better way
   }
 
-  void Context::newCurbeEntry(gbe_curbe_type value,
+  uint32_t Context::newCurbeEntry(gbe_curbe_type value,
                               uint32_t subValue,
                               uint32_t size,
                               uint32_t alignment)
@@ -376,6 +376,7 @@ namespace gbe
     GBE_ASSERT(offset >= GEN_REG_SIZE);
     kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
+    return offset;
   }
 
   uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
@@ -392,12 +393,17 @@ namespace gbe
     return offset;
   }
 
+
+  void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
+    curbeRegs.insert(std::make_pair(reg, offset));
+  }
+
   void Context::buildPatchList(void) {
     const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
     kernel->curbeSize = 0u;
 
     // We insert the block IP mask first
-    this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t));
+    this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -411,28 +417,22 @@ namespace gbe
           arg.type == ir::FunctionArgument::STRUCTURE ||
           arg.type == ir::FunctionArgument::IMAGE ||
           arg.type == ir::FunctionArgument::SAMPLER)
-        this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize);
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
     }
 
     // Already inserted registers go here
-    set<ir::Register> specialRegs;
-
     const size_t localIDSize = sizeof(uint32_t) * this->simdWidth;
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize);
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize);
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize);
-    this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32);
-    specialRegs.insert(ir::ocl::lid0);
-    specialRegs.insert(ir::ocl::lid1);
-    specialRegs.insert(ir::ocl::lid2);
-    specialRegs.insert(ir::ocl::samplerinfo);
+    insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
+    insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
+    insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
+    insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
 
     // Go over all the instructions and find the special register we need
     // to push
 #define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \
   if (reg == ir::ocl::SPECIAL_REG) { \
-    if (specialRegs.find(reg) != specialRegs.end()) continue; \
-    this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH); \
+    if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+    insertCurbeReg(reg, this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \
   } else
 
     bool useStackPtr = false;
@@ -441,7 +441,7 @@ namespace gbe
       for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
         const ir::Register reg = insn.getSrc(srcID);
         if (fn.isSpecialReg(reg) == false) continue;
-        if (specialRegs.contains(reg) == true) continue;
+        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
         if (reg == ir::ocl::stackptr) useStackPtr = true;
         INSERT_REG(lsize0, LOCAL_SIZE_X, 1)
         INSERT_REG(lsize1, LOCAL_SIZE_Y, 1)
@@ -458,17 +458,16 @@ namespace gbe
         INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
         INSERT_REG(stackptr, STACK_POINTER, this->simdWidth)
         do {} while (0);
-        specialRegs.insert(reg);
       }
     });
 #undef INSERT_REG
 
     // Insert the number of threads
-    this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
+    insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
 
     // Insert the stack buffer if used
     if (useStackPtr)
-      this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize);
+      insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
 
     // After this point the vector is immutable. Sorting it will make
     // research faster
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 50c0e70..ca2c88d 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -93,6 +93,8 @@ namespace gbe
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
     uint32_t allocateScratchMem(uint32_t size);
+    /*! Preallocated curbe register set including special registers. */
+    map<ir::Register, uint32_t> curbeRegs;
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -115,7 +117,8 @@ namespace gbe
     /*! Insert a new entry with the given size in the Curbe. Return the offset
      *  of the entry
      */
-    void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
+    void insertCurbeReg(ir::Register, uint32_t grfOffset);
+    uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
     /*! Provide for each branch and label the label index target */
     typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
     const ir::Unit &unit;                 //!< Unit that contains the kernel
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index a9132df..a72333d 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -65,8 +65,10 @@ namespace gbe
     void allocateFlags(Selection &selection);
     /*! Allocate the GRF registers */
     bool allocateGRFs(Selection &selection);
+    /*! Create gen registers for all preallocated curbe registers. */
+    void allocatePayloadRegs(void);
     /*! Create a Gen register from a register set in the payload */
-    void allocatePayloadReg(gbe_curbe_type, ir::Register, uint32_t subValue = 0, uint32_t subOffset = 0);
+    void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
     /*! Create the intervals for each register */
     /*! Allocate the vectors detected in the instruction selection pass */
     void allocateVector(Selection &selection);
@@ -124,19 +126,37 @@ namespace gbe
   GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
   GenRegAllocator::Opaque::~Opaque(void) {}
 
-  void GenRegAllocator::Opaque::allocatePayloadReg(gbe_curbe_type value,
-                                                   ir::Register reg,
-                                                   uint32_t subValue,
+  void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
+                                                   uint32_t offset,
                                                    uint32_t subOffset)
   {
     using namespace ir;
-    const Kernel *kernel = ctx.getKernel();
-    const int32_t curbeOffset = kernel->getCurbeOffset(value, subValue);
-    if (curbeOffset >= 0) {
-      const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset;
-      RA.insert(std::make_pair(reg, offset));
-      this->intervals[reg].minID = 0;
-      this->intervals[reg].maxID = 0;
+    assert(offset >= GEN_REG_SIZE);
+    offset += subOffset;
+    RA.insert(std::make_pair(reg, offset));
+    GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+    this->intervals[reg].minID = 0;
+    this->intervals[reg].maxID = 0;
+  }
+
+  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+    using namespace ir;
+    for(auto &it : this->ctx.curbeRegs)
+      allocatePayloadReg(it.first, it.second);
+
+    // Allocate all pushed registers (i.e. structure kernel arguments)
+    const Function &fn = ctx.getFunction();
+    GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (const auto &pushed : pushMap) {
+      const uint32_t argID = pushed.second.argID;
+      const FunctionArgument arg = fn.getArg(argID);
+
+      const uint32_t subOffset = pushed.second.offset;
+      const Register reg = pushed.second.getRegister();
+      auto it = this->ctx.curbeRegs.find(arg.reg);
+      assert(it != ctx.curbeRegs.end());
+      allocatePayloadReg(reg, it->second, subOffset);
     }
   }
 
@@ -535,11 +555,9 @@ namespace gbe
     }
     return true;
   }
+
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
-    const Kernel *kernel = ctx.getKernel();
-    const Function &fn = ctx.getFunction();
-    GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
     if (ctx.getSimdWidth() == 8) {
       reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
       reservedReg /= GEN_REG_SIZE;
@@ -555,25 +573,7 @@ namespace gbe
       this->intervals.push_back(ir::Register(regID));
 
     // Allocate the special registers (only those which are actually used)
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_X, ocl::lid0);
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_Y, ocl::lid1);
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_Z, ocl::lid2);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_X, ocl::lsize0);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Y, ocl::lsize1);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Z, ocl::lsize2);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_X, ocl::gsize0);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Y, ocl::gsize1);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Z, ocl::gsize2);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
-    allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
-    allocatePayloadReg(GBE_CURBE_SAMPLER_INFO, ocl::samplerinfo);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
-    allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
-    allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
+    this->allocatePayloadRegs();
 
     // Group and barrier IDs are always allocated by the hardware in r0
     RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
@@ -582,33 +582,6 @@ namespace gbe
     RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
 
     // block IP used to handle the mask in SW is always allocated
-    const int32_t blockIPOffset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_BLOCK_IP,0);
-    GBE_ASSERT(blockIPOffset >= 0 && blockIPOffset % GEN_REG_SIZE == 0);
-    RA.insert(std::make_pair(ocl::blockip, blockIPOffset));
-    this->intervals[ocl::blockip].minID = 0;
-
-    // Allocate all (non-structure) argument parameters
-    const uint32_t argNum = fn.argNum();
-    for (uint32_t argID = 0; argID < argNum; ++argID) {
-      const FunctionArgument &arg = fn.getArg(argID);
-      GBE_ASSERT(arg.type == FunctionArgument::GLOBAL_POINTER ||
-                 arg.type == FunctionArgument::CONSTANT_POINTER ||
-                 arg.type == FunctionArgument::LOCAL_POINTER ||
-                 arg.type == FunctionArgument::VALUE ||
-                 arg.type == FunctionArgument::STRUCTURE ||
-                 arg.type == FunctionArgument::IMAGE ||
-                 arg.type == FunctionArgument::SAMPLER);
-      allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, arg.reg, argID);
-    }
-
-    // Allocate all pushed registers (i.e. structure kernel arguments)
-    const Function::PushMap &pushMap = fn.getPushMap();
-    for (const auto &pushed : pushMap) {
-      const uint32_t argID = pushed.second.argID;
-      const uint32_t subOffset = pushed.second.offset;
-      const Register reg = pushed.second.getRegister();
-      allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, reg, argID, subOffset);
-    }
 
     // Compute the intervals
     int32_t insnID = 0;
-- 
2.7.4