/*! Remember that curbe register 'reg' lives at byte 'offset' inside the
 *  curbe payload. Plain map-insertion semantics: if 'reg' was already
 *  recorded, the existing offset is kept untouched. */
void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
  curbeRegs.emplace(reg, offset);
}
+ // Return the register that holds the base address of the surface bound at
+ // binding table index 'bti'. btiRegMap is filled in buildArgList(); the
+ // assert fires if no surface was ever registered for this bti (compiler bug).
+ ir::Register Context::getSurfaceBaseReg(unsigned char bti) {
+ map<unsigned char, ir::Register>::iterator iter;
+ iter = btiRegMap.find(bti);
+ GBE_ASSERT(iter != btiRegMap.end());
+ return iter->second;
+ }
void Context::buildArgList(void) {
kernel->argNum = fn.argNum();
kernel->args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, kernel->argNum);
else
kernel->args = NULL;
+ btiRegMap.clear();
+ btiRegMap.insert(std::make_pair(1, ir::ocl::stackbuffer));
for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
const auto &arg = fn.getArg(argID);
case ir::FunctionArgument::GLOBAL_POINTER:
kernel->args[argID].type = GBE_ARG_GLOBAL_PTR;
kernel->args[argID].size = sizeof(void*);
+ kernel->args[argID].bti = arg.bti;
+ btiRegMap.insert(std::make_pair(arg.bti, arg.reg));
break;
case ir::FunctionArgument::CONSTANT_POINTER:
kernel->args[argID].type = GBE_ARG_CONSTANT_PTR;
void deallocateScratchMem(int32_t offset);
/*! Preallocated curbe register set including special registers. */
map<ir::Register, uint32_t> curbeRegs;
+ ir::Register getSurfaceBaseReg(unsigned char bti);
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
set<ir::LabelIndex> usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
uint32_t simdWidth; //!< Number of lanes per HW threads
+ map<unsigned char, ir::Register> btiRegMap;
GBE_CLASS(Context); //!< Use custom allocators
};
/*! Load instruction pattern */
DECL_PATTERN(LoadInstruction)
{
+ // Emit untyped DWORD reads that may span several binding table entries
+ // (mixed-pointer loads). Pass 0 reads straight into dst (the caller sets
+ // dst2[i] == dst[i] on entry); every later pass reads into freshly
+ // allocated temporaries and ADDs them into dst.
+ // NOTE(review): correctness appears to rely on all but one surface
+ // contributing zero after rebasing -- confirm against getRelativeAddress
+ // and the surface-base setup in buildArgList().
+ void readDWord(Selection::Opaque &sel,
+ vector<GenRegister> &dst,
+ vector<GenRegister> &dst2,
+ GenRegister addr,
+ uint32_t valueNum,
+ ir::AddressSpace space,
+ ir::BTI bti) const
+ {
+ for (uint32_t x = 0; x < bti.count; x++) {
+ if(x > 0)
+ // Later passes must not clobber dst: read into fresh temporaries.
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+
+ GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+ sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
+ if(x > 0) {
+ sel.push();
+ // Scalar destination: accumulate with a single unmasked lane.
+ if(sel.isScalarReg(dst[0].reg())) {
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ }
+ for (uint32_t y = 0; y < valueNum; y++)
+ sel.ADD(dst[y], dst[y], dst2[y]);
+ sel.pop();
+ }
+ }
+ }
+
+ // Select code for an untyped (DWORD-aligned) load. Destinations are
+ // retyped to U32 and the read is routed through readDWord so loads from
+ // mixed global pointers can sum the contribution of each candidate bti.
void emitUntypedRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- uint32_t bti) const
+ ir::BTI bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> dst(valueNum);
+ vector<GenRegister> dst2(valueNum);
for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
- sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+ // dst2 aliases dst here; readDWord relies on that for its first pass.
+ dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+ readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
}
void emitDWordGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- uint32_t bti) const
+ ir::BTI bti) const
{
using namespace ir;
+ GBE_ASSERT(bti.count == 1);
const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
GBE_ASSERT(insn.getValueNum() == 1);
GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
sel.push();
sel.curr.noMask = 1;
- sel.SAMPLE(&dst, 1, &addr, 1, bti, 0, true, true);
+ sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
sel.pop();
return;
}
sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.pop();
- sel.DWORD_GATHER(dst, addrDW, bti);
+ sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
}
+ // 64-bit (QWORD) load. Only the single-value, single-bti case is
+ // supported (see asserts); the address is first rebased to be
+ // surface-relative before the READ64 message is emitted.
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- uint32_t bti) const
+ ir::BTI bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
-
+ GBE_ASSERT(bti.count == 1);
GenRegister dst[valueNum];
+ GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
- sel.READ64(addr, dst, valueNum, bti);
+ sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
}
- void emitByteGather(Selection::Opaque &sel,
- const ir::LoadInstruction &insn,
+ void readByteAsDWord(Selection::Opaque &sel,
const uint32_t elemSize,
GenRegister address,
- uint32_t bti) const
+ GenRegister dst,
+ uint32_t simdWidth,
+ uint8_t bti) const
{
using namespace ir;
- const uint32_t valueNum = insn.getValueNum();
- const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
- 1 : sel.ctx.getSimdWidth();
- if(valueNum > 1) {
- vector<GenRegister> dst(valueNum);
- const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
-
- if(elemSize == GEN_BYTE_SCATTER_WORD) {
- for(uint32_t i = 0; i < valueNum; i++)
- dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
- } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
- for(uint32_t i = 0; i < valueNum; i++)
- dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
- }
-
- uint32_t tmpRegNum = typeSize*valueNum / 4;
- vector<GenRegister> tmp(tmpRegNum);
- for(uint32_t i = 0; i < tmpRegNum; i++) {
- tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
- }
-
- sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti);
-
- for(uint32_t i = 0; i < tmpRegNum; i++) {
- sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
- }
- } else {
- GBE_ASSERT(insn.getValueNum() == 1);
- const GenRegister value = sel.selReg(insn.getValue(0));
- GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
-
Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
sel.SHR(tmpData, tmpData, tmpAddr);
if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
sel.pop();
+ }
+
+ // Byte/short gather. Multi-value loads are widened to DWORD reads
+ // (readDWord handles the multi-bti accumulation) and then unpacked back
+ // to the narrow type; single values go through readByteAsDWord once per
+ // bti, accumulating across btis the same way readDWord does.
+ void emitByteGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ const uint32_t elemSize,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+ 1 : sel.ctx.getSimdWidth();
+ RegisterFamily family = getFamily(insn.getValueType());
+
+ if(valueNum > 1) {
+ vector<GenRegister> dst(valueNum);
+ const uint32_t typeSize = getFamilySize(family);
+
+ for(uint32_t i = 0; i < valueNum; i++)
+ dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+ // Number of DWORD temporaries needed to hold all narrow values packed.
+ uint32_t tmpRegNum = typeSize*valueNum / 4;
+ vector<GenRegister> tmp(tmpRegNum);
+ vector<GenRegister> tmp2(tmpRegNum);
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ }
+
+ readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+ // Unpack each DWORD back into 4/typeSize narrow destinations.
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+ }
+ } else {
+ GBE_ASSERT(insn.getValueNum() == 1);
+ const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
+ GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+ GenRegister tmp = value;
+
+ for (int x = 0; x < bti.count; x++) {
+ if (x > 0)
+ // Later btis must not clobber 'value': read into a fresh register.
+ tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
+
+ GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+ readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
+ if (x > 0) {
+ sel.push();
+ if (simdWidth == 1) {
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ }
+ sel.ADD(value, value, tmp);
+ sel.pop();
+ }
+ }
}
}
sel.INDIRECT_MOVE(dst, src);
}
+ // Rebase a flat address to be relative to the surface bound at 'bti' by
+ // subtracting that surface's base register. Local and constant addresses
+ // are already surface-relative, so they pass through unchanged.
+ INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
+ if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+ return address;
+
+ sel.push();
+ // Unmasked: the rebased address must be valid in every lane.
+ sel.curr.noMask = 1;
+ GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+ sel.pop();
+ return temp;
+ }
+
INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
using namespace ir;
GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
address = temp;
}
- if (insn.getAddressSpace() == MEM_CONSTANT) {
+ BTI bti;
+ if (space == MEM_CONSTANT || space == MEM_LOCAL) {
+ bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ bti.count = 1;
+ } else {
+ bti = insn.getBTI();
+ }
+ if (space == MEM_CONSTANT) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, 0x2);
+ this->emitRead64(sel, insn, address, bti);
else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitDWordGather(sel, insn, address, 0x2);
+ this->emitDWordGather(sel, insn, address, bti);
else {
- this->emitByteGather(sel, insn, elemSize, address, 0x2);
+ this->emitByteGather(sel, insn, elemSize, address, bti);
+ }
+ } else {
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedRead(sel, insn, address, bti);
+ else {
+ this->emitByteGather(sel, insn, elemSize, address, bti);
}
- }
- else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
- else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
- else {
- this->emitByteGather(sel, insn, elemSize, address, space == MEM_LOCAL ? 0xfe : 0x01);
}
return true;
}
{
using namespace ir;
const AddressSpace space = insn.getAddressSpace();
- const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
address = temp;
}
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitWrite64(sel, insn, address, bti);
- else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedWrite(sel, insn, address, bti);
- else {
- this->emitByteScatter(sel, insn, elemSize, address, bti);
+ if(space == MEM_LOCAL) {
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWrite64(sel, insn, address, 0xfe);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedWrite(sel, insn, address, 0xfe);
+ else
+ this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
+ } else {
+ BTI bti = insn.getBTI();
+ for (int x = 0; x < bti.count; x++) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
+ sel.pop();
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWrite64(sel, insn, temp, bti.bti[x]);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedWrite(sel, insn, temp, bti.bti[x]);
+ else {
+ this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
+ }
+ }
}
return true;
}
using namespace ir;
const AtomicOps atomicOp = insn.getAtomicOpcode();
const AddressSpace space = insn.getAddressSpace();
- const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const uint32_t srcNum = insn.getSrcNum();
+
GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32); //address
GenRegister src1 = src0, src2 = src0;
if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
- if(space == MEM_LOCAL && sel.needPatchSLMAddr()){
- GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
- src0 = temp;
+ if(space == MEM_LOCAL) {
+ if (sel.needPatchSLMAddr()) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+ src0 = temp;
+ }
+ sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+ } else {
+ ir::BTI b = insn.getBTI();
+ for (int x = 0; x < b.count; x++) {
+ sel.push();
+ sel.curr.noMask = 1;
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
+ sel.pop();
+ sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
+ }
}
- sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
return true;
}
DECL_CTOR(AtomicInstruction, 1, 1);
OUT_UPDATE_SZ(arg.type);
OUT_UPDATE_SZ(arg.size);
OUT_UPDATE_SZ(arg.align);
- OUT_UPDATE_SZ(arg.bufSize);
+ OUT_UPDATE_SZ(arg.bti);
}
OUT_UPDATE_SZ(patches.size());
IN_UPDATE_SZ(arg.type);
IN_UPDATE_SZ(arg.size);
IN_UPDATE_SZ(arg.align);
- IN_UPDATE_SZ(arg.bufSize);
+ IN_UPDATE_SZ(arg.bti);
}
IN_UPDATE_SZ(patch_num);
outs << spaces_nl << " type value: "<< arg.type << "\n";
outs << spaces_nl << " size: "<< arg.size << "\n";
outs << spaces_nl << " align: "<< arg.align << "\n";
- outs << spaces_nl << " bufSize: "<< arg.bufSize << "\n";
+ outs << spaces_nl << " bti: "<< arg.bti << "\n";
}
outs << spaces_nl << " Patches Number is " << patches.size() << "\n";
return kernel->getArgSize(argID);
}
+ // C API shim: return the binding table index of kernel argument 'argID',
+ // or 0 when the kernel handle is NULL.
+ static uint8_t kernelGetArgBTI(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgBTI(argID);
+ }
+
static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
if (genKernel == NULL) return 0u;
const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+ gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
gbe_kernel_get_arg_type = gbe::kernelGetArgType;
gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
GBE_GET_ARG_INFO_INVALID = 0xffffffff
};
+// BTI magic number
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+
/*! Constant buffer values (ie values to setup in the constant buffer) */
enum gbe_curbe_type {
GBE_CURBE_LOCAL_ID_X = 0,
typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
+/*! Get the bti of a __global buffer */
+typedef uint8_t (gbe_kernel_get_arg_bti_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti;
+
/*! Get the type of the given argument */
typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
gbe_arg_type type; //!< Pointer, structure, image, regular value?
uint32_t size; //!< Size of the argument
uint32_t align; //!< addr alignment of the argument
- uint32_t bufSize; //!< Contant buffer size
+ uint8_t bti; //!< binding table index for __global buffer
ir::FunctionArgument::InfoFromLLVM info;
};
INLINE uint32_t getArgSize(uint32_t argID) const {
return argID >= argNum ? 0u : args[argID].size;
}
+ /*! Return the bti for __global buffer */
+ INLINE uint8_t getArgBTI(uint32_t argID) const {
+ return argID >= argNum ? 0u : args[argID].bti;
+ }
+ /*! Return the alignment of buffer argument */
INLINE uint32_t getArgAlign(uint32_t argID) const {
return argID >= argNum ? 0u : args[argID].align;
}
gbe_kernel_get_name = gbe::kernelGetName;
gbe_kernel_get_arg_type = gbe::kernelGetArgType;
gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+ gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
gbe_kernel_use_slm = gbe::kernelUseSLM;
}
void Context::input(const std::string &name, FunctionArgument::Type type, Register reg,
- FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align) {
+ FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align, unsigned char bti) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
- FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info);
+ FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
fn->args.push_back(arg);
}
LabelIndex label(void);
/*! Append a new input register for the function */
void input(const std::string &name, FunctionArgument::Type type, Register reg,
- FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0);
+ FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0, uint8_t bti = 0);
/*! Append a new output register for the function */
void output(Register reg);
/*! Get the immediate value */
/*! LOAD with the destinations directly specified */
template <typename... Args>
- void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+ void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
{
const Tuple index = this->tuple(values...);
const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
GBE_ASSERT(valueNum > 0);
- this->LOAD(type, index, offset, space, valueNum, dwAligned);
+ this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
}
/*! STORE with the sources directly specified */
template <typename... Args>
- void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+ void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
{
const Tuple index = this->tuple(values...);
const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
GBE_ASSERT(valueNum > 0);
- this->STORE(type, index, offset, space, valueNum, dwAligned);
+ this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
}
protected:
};
/*! Create a function input argument */
- INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info) :
- type(type), reg(reg), size(size), align(align), name(name), info(info) { }
+ INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info, uint8_t bti) :
+ type(type), reg(reg), size(size), align(align), name(name), info(info), bti(bti) { }
Type type; //!< Gives the type of argument we have
Register reg; //!< Holds the argument
uint32_t align; //!< address alignment for the argument
const std::string name; //!< Holds the function name for IR output
InfoFromLLVM info; //!< Holds the llvm passed info
+ uint8_t bti; //!< binding table index
GBE_STRUCT(FunctionArgument); // Use custom allocator
};
}
void ImageSet::getData(struct ImageInfo *imageInfos) const {
+ int id = 0;
for(auto &it : regMap)
- imageInfos[it.second->idx - gbe_get_image_base_index()] = *it.second;
+ imageInfos[id++] = *it.second;
}
ImageSet::~ImageSet() {
IN_UPDATE_SZ(img_info->channelOrderSlot);
IN_UPDATE_SZ(img_info->dimOrderSlot);
- indexMap.insert(std::make_pair(index, img_info));
+ indexMap.insert(std::make_pair(img_info->idx, img_info));
}
IN_UPDATE_SZ(magic);
return reg;
}
- void ImageSet::append(Register imageReg, Context *ctx)
+ void ImageSet::append(Register imageReg, Context *ctx, uint8_t bti)
{
ir::FunctionArgument *arg = ctx->getFunction().getArg(imageReg);
GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
int32_t id = ctx->getFunction().getArgID(arg);
struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
imageInfo->arg_idx = id;
- imageInfo->idx = regMap.size() + gbe_get_image_base_index();
+ imageInfo->idx = bti;
imageInfo->wSlot = -1;
imageInfo->hSlot = -1;
imageInfo->depthSlot = -1;
{
public:
/*! Append an image argument. */
- void append(Register imageReg, Context *ctx);
+ void append(Register imageReg, Context *ctx, uint8_t bti);
/*! Append an image info slot. */
void appendInfo(ImageInfoKey key, uint32_t offset);
/*! Append an image info register. */
AtomicInstruction(AtomicOps atomicOp,
Register dst,
AddressSpace addrSpace,
+ BTI bti,
Tuple src)
{
this->opcode = OP_ATOMIC;
this->dst[0] = dst;
this->src = src;
this->addrSpace = addrSpace;
+ this->bti = bti;
srcNum = 2;
if((atomicOp == ATOMIC_OP_INC) ||
(atomicOp == ATOMIC_OP_DEC))
srcNum = 3;
}
INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
INLINE void out(std::ostream &out, const Function &fn) const;
Register dst[1];
Tuple src;
AddressSpace addrSpace; //!< Address space
+ BTI bti; //!< bti
uint8_t srcNum:2; //!<Source Number
AtomicOps atomicOp:6; //!< Atomic operation code
};
Register offset,
AddressSpace addrSpace,
uint32_t valueNum,
- bool dwAligned)
+ bool dwAligned,
+ BTI bti)
{
GBE_ASSERT(valueNum < 128);
this->opcode = OP_LOAD;
this->addrSpace = addrSpace;
this->valueNum = valueNum;
this->dwAligned = dwAligned ? 1 : 0;
+ this->bti = bti;
}
INLINE Register getDst(const Function &fn, uint32_t ID) const {
GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
INLINE Type getValueType(void) const { return type; }
INLINE uint32_t getValueNum(void) const { return valueNum; }
INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
INLINE bool isAligned(void) const { return !!dwAligned; }
Register offset; //!< Alias to make it similar to store
Tuple values; //!< Values to load
AddressSpace addrSpace; //!< Where to load
+ BTI bti;
uint8_t valueNum:7; //!< Number of values to load
uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
};
Register offset,
AddressSpace addrSpace,
uint32_t valueNum,
- bool dwAligned)
+ bool dwAligned,
+ BTI bti)
{
GBE_ASSERT(valueNum < 255);
this->opcode = OP_STORE;
this->addrSpace = addrSpace;
this->valueNum = valueNum;
this->dwAligned = dwAligned ? 1 : 0;
+ this->bti = bti;
}
INLINE Register getSrc(const Function &fn, uint32_t ID) const {
GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
INLINE uint32_t getValueNum(void) const { return valueNum; }
INLINE Type getValueType(void) const { return type; }
INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
INLINE bool isAligned(void) const { return !!dwAligned; }
Register offset; //!< First source is the offset where to store
Tuple values; //!< Values to store
AddressSpace addrSpace; //!< Where to store
+ BTI bti; //!< Which btis need access
uint8_t valueNum:7; //!< Number of values to store
uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
Register dst[0]; //!< No destination
out << " {" << "%" << this->getSrc(fn, 0) << "}";
for (uint32_t i = 1; i < srcNum; ++i)
out << " %" << this->getSrc(fn, i);
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
}
out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
out << "}";
out << " %" << this->getSrc(fn, 0);
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
}
INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
for (uint32_t i = 0; i < valueNum; ++i)
out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
out << "}";
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
}
INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
return HelperIntrospection<CLASS, RefClass>::value == 1;
#define START_INTROSPECTION(CLASS) \
- static_assert(sizeof(internal::CLASS) == sizeof(uint64_t), \
+ static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
"Bad instruction size"); \
static_assert(offsetof(internal::CLASS, opcode) == 0, \
"Bad opcode offset"); \
DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
}
// For all unary functions with given opcode
- Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Tuple src) {
- return internal::AtomicInstruction(atomicOp, dst, space, src).convert();
+ Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
+ return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
}
// BRA
Register offset, \
AddressSpace space, \
uint32_t valueNum, \
- bool dwAligned) \
+ bool dwAligned, \
+ BTI bti) \
{ \
- return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned).convert(); \
+ return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
}
DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
#include "sys/intrusive_list.hpp"
#include <ostream>
+#define MAX_MIXED_POINTER 4
namespace gbe {
namespace ir {
+ /*! Set of binding table indices a load/store/atomic may touch
+ (mixed-pointer support); at most MAX_MIXED_POINTER entries. */
+ struct BTI {
+ uint8_t bti[MAX_MIXED_POINTER]; //!< candidate binding table indices
+ uint8_t count; //!< number of valid entries in bti[]
+ };
/*! All opcodes */
enum Opcode : uint8_t {
///////////////////////////////////////////////////////////////////////////
/*! Stores instruction internal data and opcode */
- class ALIGNED(sizeof(uint64_t)) InstructionBase
+ class ALIGNED(sizeof(uint64_t)*2) InstructionBase
{
public:
/*! Initialize the instruction from a 8 bytes stream */
/*! Get the instruction opcode */
INLINE Opcode getOpcode(void) const { return opcode; }
protected:
- enum { opaqueSize = sizeof(uint64_t)-sizeof(uint8_t) };
+ enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
Opcode opcode; //!< Idendifies the instruction
char opaque[opaqueSize]; //!< Remainder of it
GBE_CLASS(InstructionBase); //!< Use internal allocators
static const uint32_t addressIndex = 0;
/*! Address space that is manipulated here */
AddressSpace getAddressSpace(void) const;
+ BTI getBTI(void) const;
/*! Return the atomic function code */
AtomicOps getAtomicOpcode(void) const;
/*! Return the register that contains the addresses */
Type getValueType(void) const;
/*! Give the number of values the instruction is storing (srcNum-1) */
uint32_t getValueNum(void) const;
+ BTI getBTI(void) const;
/*! Address space that is manipulated here */
AddressSpace getAddressSpace(void) const;
/*! DWORD aligned means untyped read for Gen. That is what matters */
bool isAligned(void) const;
/*! Return the register that contains the addresses */
INLINE Register getAddress(void) const { return this->getSrc(0u); }
+ BTI getBTI(void) const;
/*! Return the register that contain value valueID */
INLINE Register getValue(uint32_t valueID) const {
return this->getDst(valueID);
/*! F32TO16.{dstType <- srcType} dst src */
Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
/*! atomic dst addr.space {src1 {src2}} */
- Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
+ Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
/*! bra labelIndex */
Instruction BRA(LabelIndex labelIndex);
/*! (pred) bra labelIndex */
/*! ret */
Instruction RET(void);
/*! load.type.space {dst1,...,dst_valueNum} offset value */
- Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
/*! store.type.space offset {src1,...,src_valueNum} value */
- Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
/*! loadi.type dst value */
Instruction LOADI(Type type, Register dst, ImmediateIndex value);
/*! sync.params... (see Sync instruction) */
#include "ir/value.hpp"
#include "sys/set.hpp"
#include "sys/cvar.hpp"
+#include "backend/program.h"
/* Not defined for LLVM 3.0 */
#if !defined(LLVM_VERSION_MAJOR)
* compare instructions we need to invert to decrease branch complexity
*/
set<const Value*> conditionSet;
+ map<const Value*, int> globalPointer;
/*!
* <phi,phiCopy> node information for later optimization
*/
LoopInfo *LI;
const Module *TheModule;
-
+ int btiBase;
public:
static char ID;
explicit GenWriter(ir::Unit &unit)
ctx(unit),
regTranslator(ctx),
LI(0),
- TheModule(0)
+ TheModule(0),
+ btiBase(BTI_RESERVED_NUM)
{
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
pass = PASS_EMIT_REGISTERS;
LI = &getAnalysis<LoopInfo>();
emitFunction(F);
phiMap.clear();
+ globalPointer.clear();
+ // Reset for next function
+ btiBase = BTI_RESERVED_NUM;
return false;
}
void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
template <bool isLoad, typename T> void visitLoadOrStore(T &I);
+
+ INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
// batch vec4/8/16 load/store
INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValue, const ir::Register ptr,
- const ir::AddressSpace addrSpace, Type * elemType, bool isLoad);
+ const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
};
const uint32_t elemSize = getTypeByteSize(unit, elemType);
const uint32_t elemNum = vectorType->getNumElements();
//vector's elemType always scalar type
- ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type));
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
ir::Function& fn = ctx.getFunction();
for(uint32_t i=1; i < elemNum; i++) {
"vector type in the function argument is not supported yet");
const ir::Register reg = getRegister(I);
if (type->isPointerTy() == false)
- ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type));
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
else {
PointerType *pointerType = dyn_cast<PointerType>(type);
Type *pointed = pointerType->getElementType();
if (I->hasByValAttr()) {
#endif /* LLVM_VERSION_MINOR <= 1 */
const size_t structSize = getTypeByteSize(unit, pointed);
- ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type));
+ ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
}
// Regular user provided pointer (global, local or constant)
else {
const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
- ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align);
+ globalPointer.insert(std::make_pair(I, btiBase));
+ ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
+ btiBase++;
break;
case ir::MEM_LOCAL:
- ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align);
+ ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, 0xfe);
ctx.getFunction().setUseSLM(true);
break;
case ir::MEM_CONSTANT:
- ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, llvmInfo, ptrSize, align);
+ ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, llvmInfo, ptrSize, align, 0x2);
break;
case ir::IMAGE:
- ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align);
- ctx.getFunction().getImageSet()->append(reg, &ctx);
+ ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
+ ctx.getFunction().getImageSet()->append(reg, &ctx, btiBase++);
break;
default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
}
const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
const ir::Register dst = this->getRegister(&I);
+ ir::BTI bti;
+ gatherBTI(*AI, bti);
vector<ir::Register> src;
uint32_t srcNum = 0;
while(AI != AE) {
srcNum++;
}
const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
- ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
+ ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
}
/* append a new sampler. should be called before any reference to
void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValues, const ir::Register ptr,
const ir::AddressSpace addrSpace,
- Type * elemType, bool isLoad) {
+ Type * elemType, bool isLoad, ir::BTI bti) {
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
// Emit the instruction
if (isLoad)
- ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true);
+ ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
else
- ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true);
+ ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
}
}
+ // The idea behind this is to walk the use-def chain and find every
+ // possible source of the pointer. Later codegen can then emit the
+ // read/write messages against all of the BTIs gathered here.
+ void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
+ typedef map<const Value*, int>::iterator GlobalPtrIter;
+ Value *p;
+ size_t idx = 0;
+ int nBTI = 0;
+ std::vector<Value*> candidates;
+ candidates.push_back(pointer);
+ std::set<Value*> processed;
+ bool needNewBTI = true;
+
+ while (idx < candidates.size()) {
+ bool isPrivate = false;
+ p = candidates[idx];
+
+ while (dyn_cast<User>(p)) {
+
+ if (processed.find(p) == processed.end()) {
+ processed.insert(p);
+ } else {
+ // This use-def chain falls into a loop,
+ // it does not introduce a new buffer source.
+ needNewBTI = false;
+ break;
+ }
+
+ if (dyn_cast<SelectInst>(p)) {
+ SelectInst *sel = cast<SelectInst>(p);
+ p = sel->getTrueValue();
+ candidates.push_back(sel->getFalseValue());
+ continue;
+ }
+
+ if (dyn_cast<PHINode>(p)) {
+ PHINode* phi = cast<PHINode>(p);
+ int n = phi->getNumIncomingValues();
+ for (int j = 1; j < n; j++)
+ candidates.push_back(phi->getIncomingValue(j));
+ p = phi->getIncomingValue(0);
+ continue;
+ }
+
+ if (dyn_cast<AllocaInst>(p)) {
+ isPrivate = true;
+ break;
+ }
+ p = cast<User>(p)->getOperand(0);
+ }
+
+ if (needNewBTI == false) {
+ // go to next possible pointer source
+ idx++; continue;
+ }
+
+ uint8_t new_bti = 0;
+ if (isPrivate) {
+ new_bti = BTI_PRIVATE;
+ } else {
+ if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
+ // structure value implementation is not complete now,
+ // they are now treated as push constant, so, the load/store
+ // here is not as meaningful.
+ bti.bti[0] = BTI_PRIVATE;
+ bti.count = 1;
+ break;
+ }
+ Type *ty = p->getType();
+ if(ty->getPointerAddressSpace() == 3) {
+ // __local memory
+ new_bti = 0xfe;
+ } else {
+ // __global memory
+ GlobalPtrIter iter = globalPointer.find(p);
+ GBE_ASSERT(iter != globalPointer.end());
+ new_bti = iter->second;
+ }
+ }
+ // avoid duplicate
+ bool bFound = false;
+ for (int j = 0; j < nBTI; j++) {
+ if (bti.bti[j] == new_bti) {
+ bFound = true; break;
+ }
+ }
+ if (bFound == false) {
+ bti.bti[nBTI++] = new_bti;
+ bti.count = nBTI;
+ }
+ idx++;
+ }
+ GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
+ }
+
extern int OCL_SIMD_WIDTH;
template <bool isLoad, typename T>
INLINE void GenWriter::emitLoadOrStore(T &I)
const bool dwAligned = (I.getAlignment() % 4) == 0;
const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
const ir::Register ptr = this->getRegister(llvmPtr);
-
+ ir::BTI binding;
+ if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
+ gatherBTI(llvmPtr, binding);
+ }
// Scalar is easy. We neednot build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
- ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
+ ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
else
- ctx.STORE(type, ptr, addrSpace, dwAligned, values);
+ ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
}
// A vector type requires to build a tuple
else {
// Emit the instruction
if (isLoad)
- ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+ ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
else
- ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+ ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
}
// Not supported by the hardware. So, we split the message and we use
// strided loads and stores
else {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
}
}
else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
if(regTranslator.isUndefConst(llvmValues, elemID))
ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
}
if (isLoad)
- ctx.LOAD(type, addr, addrSpace, dwAligned, reg);
+ ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
else
- ctx.STORE(type, addr, addrSpace, dwAligned, reg);
+ ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
}
}
}
--- /dev/null
+
/* Work items 0, 1 and 4 read their element through src1, every other
 * work item reads through src2 — forces the compiler to track a pointer
 * with two possible buffer sources. */
kernel void compiler_mixed_pointer(__global uint* src1, __global uint *src2, __global uint *dst) {
  int x = get_global_id(0);
  global uint * tmp;

  if (x == 0 || x == 1 || x == 4)
    tmp = src1;
  else
    tmp = src2;
  dst[x] = tmp[x];
}
+
/* The first five work items store through dst1, the rest through dst2 —
 * a mixed destination pointer coming from a select. */
kernel void compiler_mixed_pointer1(__global uint* src, __global uint *dst1, __global uint *dst2) {
  int x = get_global_id(0);
  global uint * tmp;
  if (x < 5)
    tmp = dst1;
  else
    tmp = dst2;
  tmp[x] = src[x];
}
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl());
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
} else {
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl());
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
}
}
if(raw_size == 0)
return 0;
- cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size);
+ cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
if (bo == NULL)
return -1;
cl_buffer_map(bo, 1);
*/
if(cl_driver_get_ver(ctx->drv) == 75)
stack_sz *= 4;
-
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
+ cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
}
static int
extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
/* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t cchint);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
/* bind samplers defined in both kernel and kernel args. */
typedef int (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
-typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size, uint8_t bti);
extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
/* Setup all indirect states */
gbe_kernel_get_code_size_cb *interp_kernel_get_code_size = NULL;
gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num = NULL;
gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size = NULL;
+gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti = NULL;
gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type = NULL;
gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align = NULL;
gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width = NULL;
if (interp_kernel_get_arg_size == NULL)
return false;
+ interp_kernel_get_arg_bti = *(gbe_kernel_get_arg_bti_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_bti");
+ if (interp_kernel_get_arg_bti == NULL)
+ return false;
+
interp_kernel_get_arg_type = *(gbe_kernel_get_arg_type_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_type");
if (interp_kernel_get_arg_type == NULL)
return false;
extern gbe_kernel_get_code_size_cb *interp_kernel_get_code_size;
extern gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num;
extern gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size;
+extern gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti;
extern gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type;
extern gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align;
extern gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width;
k->args[index].mem = mem;
k->args[index].is_set = 1;
k->args[index].local_sz = 0;
-
+ k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
return CL_SUCCESS;
}
typedef struct cl_argument {
cl_mem mem; /* For image and regular buffers */
cl_sampler sampler; /* For sampler. */
+ unsigned char bti; /* Binding table index for this argument's surface */
uint32_t local_sz:31; /* For __local size specification */
uint32_t is_set:1; /* All args must be set before NDRange */
} cl_argument;
unsigned long img_bitmap; /* image usage bitmap. */
unsigned int img_index_base; /* base index for image surface.*/
- drm_intel_bo *binded_img[max_img_n + 128]; /* all images binded for the call */
unsigned long sampler_bitmap; /* sampler usage bitmap. */
}
static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size)
+intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
uint32_t s = size - 1;
assert(size != 0);
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
- gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
memset(ss2, 0, sizeof(gen7_surface_state_t));
ss2->ss0.surface_type = I965_SURFACE_BUFFER;
ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
- heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+ heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
if(gpgpu->constant_b.bo)
dri_bo_unreference(gpgpu->constant_b.bo);
I915_GEM_DOMAIN_RENDER,
0,
gpgpu->aux_offset.surface_heap_offset +
- heap->binding_table[2] +
+ heap->binding_table[bti] +
offsetof(gen7_surface_state_t, ss1),
gpgpu->constant_b.bo);
return gpgpu->constant_b.bo;
}
static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size)
+intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
uint32_t s = size - 1;
assert(size != 0);
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
- gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
memset(ss2, 0, sizeof(gen7_surface_state_t));
ss2->ss0.surface_type = I965_SURFACE_BUFFER;
ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
- heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+ heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
if(gpgpu->constant_b.bo)
dri_bo_unreference(gpgpu->constant_b.bo);
I915_GEM_DOMAIN_RENDER,
0,
gpgpu->aux_offset.surface_heap_offset +
- heap->binding_table[2] +
+ heap->binding_table[bti] +
offsetof(gen7_surface_state_t, ss1),
gpgpu->constant_b.bo);
return gpgpu->constant_b.bo;
}
-
-/* Map address space with two 2GB surfaces. One surface for untyped message and
- * one surface for byte scatters / gathers. Actually the HW does not require two
- * surfaces but Fulsim complains
- */
static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
+/* Program a RAW buffer surface state at binding table slot "index",
+ * sized to "size" bytes and based at buf + internal_offset. */
+intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
{
+ uint32_t s = size - 1;
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
- gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
- gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
+ gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
memset(ss0, 0, sizeof(gen7_surface_state_t));
- memset(ss1, 0, sizeof(gen7_surface_state_t));
- ss1->ss0.surface_type = ss0->ss0.surface_type = I965_SURFACE_BUFFER;
- ss1->ss0.surface_format = ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
- ss1->ss2.width = ss0->ss2.width = 127; /* bits 6:0 of sz */
- ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
- ss0->ss3.depth = 1023; /* bits 30:21 of sz */
- ss1->ss3.depth = 1023; /* bits 30:21 of sz */
- ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
- heap->binding_table[0] = offsetof(surface_heap_t, surface);
- heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ /* Buffer length is encoded as (size - 1) split over width/height/depth.
+ * NOTE(review): size == 0 would wrap s to 0xffffffff; callers appear to
+ * guarantee size != 0 — confirm. */
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+ /* Record the presumed GPU address now, and emit a relocation so the
+ * kernel driver patches ss1 if the buffer object moves. */
+ ss0->ss1.base_addr = buf->offset + internal_offset;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ buf);
}
+
static int
intel_is_surface_array(cl_mem_object_type type)
{
}
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
- gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
assert(index < GEN_MAX_SURFACES);
}
surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
memset(ss, 0, sizeof(*ss));
-
ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
if (index > 128 + 2 && type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
ss->ss0.surface_type = I965_SURFACE_2D;
}
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
- gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
assert(index < GEN_MAX_SURFACES);
}
static void
intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
- uint32_t internal_offset, uint32_t cchint)
+ uint32_t internal_offset, uint32_t size, uint8_t bti)
{
assert(gpgpu->binded_n < max_buf_n);
gpgpu->binded_buf[gpgpu->binded_n] = buf;
gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
gpgpu->binded_offset[gpgpu->binded_n] = offset;
gpgpu->binded_n++;
+ intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
}
static int
return 0;
}
static void
-intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
{
drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
- intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, cchint);
+
+ intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
}
static void
{
gpgpu->ker = kernel;
intel_gpgpu_build_idrt(gpgpu, kernel);
- intel_gpgpu_map_address_space(gpgpu);
dri_bo_unmap(gpgpu->aux_buf.bo);
}
}
memset(bo->virtual, 0, size);
drm_intel_bo_unmap(bo);
-
- intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, 0);
+ intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, 0);
return 0;
}
compiler_insert_to_constant.cpp
compiler_argument_structure.cpp
compiler_arith_shift_right.cpp
+ compiler_mixed_pointer.cpp
compiler_array0.cpp
compiler_array.cpp
compiler_array1.cpp
--- /dev/null
+#include "utest_helper.hpp"
+
// CPU reference for the compiler_mixed_pointer kernel: work items 0, 1
// and 4 read their element from src1, every other work item from src2.
static void cpu(int global_id, int *src1, int *src2, int *dst) {
  const bool use_first = (global_id == 0) || (global_id == 1) || (global_id == 4);
  int *picked = use_first ? src1 : src2;
  dst[global_id] = picked[global_id];
}
// CPU reference for the compiler_mixed_pointer1 kernel: the first five
// work items copy their element into dst1, the remaining ones into dst2.
static void cpu1(int global_id, int *src, int *dst1, int *dst2) {
  if (global_id < 5)
    dst1[global_id] = src[global_id];
  else
    dst2[global_id] = src[global_id];
}
+
+// Utest: verify a load through a pointer with two possible buffer
+// sources (switch between src1/src2) matches the CPU reference in cpu().
+void compiler_mixed_pointer(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16], cpu_src1[16];
+
+ // Setup kernel and buffers: two source buffers and one destination.
+ OCL_CREATE_KERNEL("compiler_mixed_pointer");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Single pass: fill inputs, run GPU and CPU reference, then compare.
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ // Mirror the GPU input buffers into the CPU reference arrays.
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+ cpu_src1[i] = ((int32_t*)buf_data[1])[i] = 65536-i;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run the reference implementation on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_src1, cpu_dst);
+
+ // Compare GPU output against the CPU reference
+ OCL_MAP_BUFFER(2);
+ for (size_t i = 0; i < n; ++i) {
+// printf(" %d %d\n", cpu_dst[i], ((int32_t*)buf_data[2])[i]);
+ OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer);
+
+// Utest: verify a store through a pointer with two possible buffer
+// destinations (select between dst1/dst2) matches the CPU reference cpu1().
+void compiler_mixed_pointer1(void)
+{
+ const size_t n = 16;
+ int cpu_dst1[16], cpu_dst2[16], cpu_src[16];
+
+ // Setup kernel and buffers: one source buffer, two destinations.
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_mixed_pointer", "compiler_mixed_pointer1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Single pass: fill inputs, run GPU and CPU reference, then compare.
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ // Initialize both destinations to 0xff so untouched slots are visible.
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+ cpu_dst1[i] = ((int32_t*)buf_data[1])[i] = 0xff;
+ cpu_dst2[i] = ((int32_t*)buf_data[2])[i] = 0xff;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run the reference implementation on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu1(i, cpu_src, cpu_dst1, cpu_dst2);
+
+ // Compare both GPU destinations against the CPU reference
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (size_t i = 0; i < n; ++i) {
+// printf(" %d %d\n", cpu_dst1[i], ((int32_t*)buf_data[1])[i]);
+// printf(" %d %d\n", ((int32_t*)buf_data[2])[i], cpu_dst2[i]);
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst1[i]);
+ OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst2[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer1);