From: Zhigang Gong <zhigang.gong@linux.intel.com>
Date: Wed, 25 Sep 2013 10:26:49 +0000 (+0800)
Subject: GBE/Runtime: implement workaround for IVB sampler bug
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b835433189a3dca202e7fc7d9ff0bfbc49676281;p=contrib%2Fbeignet.git

GBE/Runtime: implement workaround for IVB sampler bug

Per IVB spec,

If the surface format of the associated surface is UINT or SINT,
the Surface Type cannot be SURFTYPE_3D or SURFTYPE_CUBE and Address
Control Mode cannot be CLAMP_BORDER or HALF_BORDER.

Besides this bug, there is another undocumented issue. If a surface
data type is IEEE float, then when we use the sampler to sample a pixel
and the value is between -1p-20 and 0, the sampler rounds it to zero.
This also causes problems when we are using the clamp mode.

This patch works around the above two hardware issues.
It introduces a new intrinsic, get_sampler_info, to get a sampler's type
at runtime. When read_image is called, it checks whether it hits either
of the above two cases. If it hits case 1, we force the sampler to use
clamp to edge for the pixels within the box, and for the pixels outside
the box we set the border color manually. To achieve this, we have to
prepare two sampler slots for each CL_ADDRESS_CLAMP sampler: the first,
slot_1, uses CL_ADDRESS_CLAMP, and the second uses slot_1 + 8. Thus we
can only use half of the 16 samplers. Fortunately, 8 samplers still
comply with OpenCL's minimum requirement.

If it hits case 2, we subtract an epsilon from the coordinate so that
it does not round to zero.

If possible, programmers should avoid using float coordinates and/or int/uint
format images. Otherwise, they will hit the very slow path.

With this workaround, the compiler_copy_image1 can pass now.

Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: "Yang, Rong R" <rong.r.yang@intel.com>
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09e910c..3d18f50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,9 @@ ELSE (EMULATE_IVB)
   ADD_DEFINITIONS(-DEMULATE_GEN=0)
 ENDIF (EMULATE_HSW)
 
+# XXX now hard coded to enable the clamp to border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
 IF (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=1)
 ELSE (USE_FULSIM)
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 8622f3e..476c6f2 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -34,7 +34,7 @@ else (GBE_DEBUG_MEMORY)
 endif (GBE_DEBUG_MEMORY)
 
 # Hide all symbols and allows the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden")
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")
 
 if (COMPILER STREQUAL "GCC")
   set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 5ef1234..36bf688 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,7 @@ set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
 set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
 set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
 set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
 set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
 set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
 set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
@@ -32,7 +33,7 @@ add_custom_command(
 add_custom_command(
   OUTPUT ${ocl_blob_file}
   COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
-  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
+  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
 
 
 set (pch_object ${ocl_blob_file}.pch)
@@ -46,7 +47,7 @@ else (LLVM_VERSION_NODOT VERSION_GREATER 32)
         set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
     endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -fno-builtin)
+set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 
 add_custom_command(
      OUTPUT ${pch_object}
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index ac3a243..a55ef04 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -315,10 +315,12 @@ namespace gbe
       GBE_DELETE(this->kernel);
       this->kernel = NULL;
     }
-    if(this->kernel != NULL)
+    if(this->kernel != NULL) {
+      // Align it on 32 bytes properly
+      this->kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
       this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
-    if(this->kernel != NULL)
       this->kernel->ctx = this;
+    }
     return this->kernel;
   }
 
@@ -419,9 +421,11 @@ namespace gbe
     this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize);
     this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize);
     this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize);
+    this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32);
     specialRegs.insert(ir::ocl::lid0);
     specialRegs.insert(ir::ocl::lid1);
     specialRegs.insert(ir::ocl::lid2);
+    specialRegs.insert(ir::ocl::samplerinfo);
 
     // Go over all the instructions and find the special register we need
     // to push
@@ -470,7 +474,6 @@ namespace gbe
     // research faster
     std::sort(kernel->patches.begin(), kernel->patches.end());
 
-    // Align it on 32 bytes properly
     kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 3e539a2..d9ea7ff 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2755,6 +2755,13 @@ namespace gbe
       using namespace ir;
       GenRegister msgPayloads[4];
       GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
+      uint32_t srcNum = insn.getSrcNum();
+      uint32_t samplerOffset = 0;
+      if (srcNum == 6) {
+      /* We have the clamp border workaround. */
+        samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
+        srcNum--;
+      }
 
       for( int i = 0; i < 4; ++i)
         msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -2762,15 +2769,15 @@ namespace gbe
       for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+      for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
         src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
 
       uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
                        (insn.getSrc(SampleInstruction::SURFACE_BTI));
       uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
-                           (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+                           (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
+      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -2793,7 +2800,7 @@ namespace gbe
         msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
 
       // u, v, w coords should use coord type.
-      for (; valueID < 1 + coordNum; ++valueID)
+      for (; valueID < coordNum; ++valueID)
         src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
 
       for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
@@ -2826,6 +2833,22 @@ namespace gbe
     DECL_CTOR(GetImageInfoInstruction, 1, 1);
   };
 
+  /*! get sampler info instruction pattern. */
+  DECL_PATTERN(GetSamplerInfoInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
+    {
+      using namespace ir;
+      GenRegister dst, src;
+      dst = sel.selReg(insn.getDst(0), TYPE_U16);
+      src = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0)) * 2);
+      src.subphysical = 1;
+      sel.MOV(dst, src);
+      return true;
+    }
+    DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
+  };
+
   /*! Branch instruction pattern */
   DECL_PATTERN(BranchInstruction)
   {
@@ -3000,6 +3023,7 @@ namespace gbe
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
+    this->insert<GetSamplerInfoInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 2abfb12..a9132df 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -568,6 +568,7 @@ namespace gbe
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
     allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
+    allocatePayloadReg(GBE_CURBE_SAMPLER_INFO, ocl::samplerinfo);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
@@ -753,7 +754,8 @@ namespace gbe
       }
       GBE_ASSERT(RA.contains(reg.reg()) != false);
       const uint32_t grfOffset = RA.find(reg.reg())->second;
-      const GenRegister dst = setGenReg(reg, grfOffset);
+      const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+      const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
       if (reg.quarter != 0)
         return GenRegister::Qn(dst, reg.quarter);
       else
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index ddf53a2..538f16c 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -235,6 +235,7 @@ namespace gbe
     uint32_t nr:8;         //!< Just for some physical registers (acc, null)
     uint32_t subnr:8;      //!< Idem
     uint32_t physical:1;   //!< 1 if physical, 0 otherwise
+    uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
     uint32_t type:4;       //!< Gen type
     uint32_t file:2;       //!< Register file
     uint32_t negation:1;   //!< For source
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 68bb17e..ffd31d9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -468,7 +468,9 @@ namespace gbe {
       useless.push_back(str);
       args.push_back(str.c_str());
     }
-
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+    args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
     args.push_back("-emit-llvm");
     // XXX we haven't implement those builtin functions,
     // so disable it currently.
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index ff4d157..8774344 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -70,6 +70,7 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
+  GBE_CURBE_SAMPLER_INFO,
   GBE_CURBE_IMAGE_INFO,
   GBE_CURBE_STACK_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index b3b9e10..0278bc6 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -486,7 +486,7 @@ namespace ir {
       INLINE Type getSrcType(void) const { return this->srcType; }
       INLINE Type getDstType(void) const { return this->dstType; }
 
-      static const uint32_t srcNum = 5;
+      static const uint32_t srcNum = 6;
       static const uint32_t dstNum = 4;
     };
 
@@ -528,6 +528,32 @@ namespace ir {
       Register dst[0];               //!< No dest register
     };
 
+    class ALIGNED_INSTRUCTION GetSamplerInfoInstruction :
+      public BasePolicy,
+      public NSrcPolicy<GetSamplerInfoInstruction, 1>,
+      public NDstPolicy<GetSamplerInfoInstruction, 1>
+    {
+    public:
+      GetSamplerInfoInstruction( Register dst,
+                                 Register src)
+      {
+        this->opcode = OP_GET_SAMPLER_INFO;
+        this->dst[0] = dst;
+        this->src[0] = src;
+      }
+
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << " sampler id %" << this->getSrc(fn, 0)
+            << " %" << this->getDst(fn, 0);
+      }
+
+      Register src[1];                  //!< Sampler to get info
+      Register dst[1];                  //!< return value
+      static const uint32_t dstNum = 1;
+    };
+
     class ALIGNED_INSTRUCTION GetImageInfoInstruction :
       public BasePolicy,
       public NSrcPolicy<GetImageInfoInstruction, 1>,
@@ -886,6 +912,9 @@ namespace ir {
     { return true; }
     INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
+    INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
+
 
     // Ensure that types and register family match
     INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
@@ -1144,6 +1173,10 @@ START_INTROSPECTION(GetImageInfoInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(GetImageInfoInstruction)
 
+START_INTROSPECTION(GetSamplerInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetSamplerInfoInstruction)
+
 START_INTROSPECTION(LoadImmInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LoadImmInstruction)
@@ -1499,6 +1532,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::GetImageInfoInstruction(infoType, dst, src).convert();
   }
 
+  Instruction GET_SAMPLER_INFO(Register dst, Register src) {
+    return internal::GetSamplerInfoInstruction(dst, src).convert();
+  }
+
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
     switch (insn.getOpcode()) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 0f7df58..3697c17 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -365,6 +365,7 @@ namespace ir {
     };
     uint32_t data;
   } ImageInfoKey;
+
   /*! Get image information */
   class GetImageInfoInstruction : public Instruction {
   public:
@@ -399,6 +400,14 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Get sampler information */
+  class GetSamplerInfoInstruction : public Instruction {
+  public:
+
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Branch instruction is the unified way to branch (with or without
    *  predicate)
    */
@@ -636,6 +645,8 @@ namespace ir {
   Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
   /*! get image information , such as width/height/depth/... */
   Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
+  /*! get sampler information  */
+  Instruction GET_SAMPLER_INFO(Register dst, Register src);
   /*! label labelIndex */
   Instruction LABEL(LabelIndex labelIndex);
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index f3f2db6..1a9f867 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -72,6 +72,7 @@ DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
 DECL_INSN(I64_MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 927e43d..10e0c59 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,7 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "work_dimension",
+        "work_dimension", "sampler_info"
     };
 
 #if GBE_DEBUG
@@ -76,6 +76,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, barrierid);
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
+      DECL_NEW_REG(FAMILY_WORD, samplerinfo);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index c79bc3b..89dd69f 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -64,7 +64,8 @@ namespace ir {
     static const Register barrierid = Register(20);// barrierid
     static const Register threadn = Register(21);  // number of threads
     static const Register workdim = Register(22);  // work dimention.
-    static const uint32_t regNum = 23;             // number of special registers
+    static const Register samplerinfo = Register(23); // store sampler info.
+    static const uint32_t regNum = 24;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 05b5874..27263f8 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -540,6 +540,8 @@ namespace gbe
     // Emit unary instructions from gen native function
     void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
 
+    ir::Register appendSampler(CallSite::arg_iterator AI);
+
     // These instructions are not supported at all
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
     void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
@@ -1809,6 +1811,7 @@ namespace gbe
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
       case GEN_OCL_GET_IMAGE_DEPTH:
+      case GEN_OCL_GET_SAMPLER_INFO:
       case GEN_OCL_ATOMIC_ADD0:
       case GEN_OCL_ATOMIC_ADD1:
       case GEN_OCL_ATOMIC_SUB0:
@@ -1952,6 +1955,25 @@ namespace gbe
     ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
   }
 
+  /* append a new sampler. should be called before any reference to
+   * a sampler_t value. */
+  ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) {
+    Constant *CPV = dyn_cast<Constant>(*AI);
+    ir::Register sampler;
+    if (CPV != NULL)
+    {
+      // This is not a kernel argument sampler, we need to append it to sampler set,
+      // and allocate a sampler slot for it.
+      auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+      GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+      sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+    } else {
+      sampler = this->getRegister(*AI);
+      ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+    }
+    return sampler;
+  }
+
   void GenWriter::emitCallInst(CallInst &I) {
     if (Function *F = I.getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {
@@ -2092,6 +2114,14 @@ namespace gbe
             ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
             break;
           }
+          case GEN_OCL_GET_SAMPLER_INFO:
+          {
+            GBE_ASSERT(AI != AE);
+            const ir::Register sampler = this->appendSampler(AI); ++AI;
+            const ir::Register reg = this->getRegister(&I, 0);
+            ctx.GET_SAMPLER_INFO(reg, sampler);
+            break;
+          }
           case GEN_OCL_READ_IMAGE0:
           case GEN_OCL_READ_IMAGE1:
           case GEN_OCL_READ_IMAGE2:
@@ -2107,19 +2137,7 @@ namespace gbe
           {
             GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE);
-            Constant *CPV = dyn_cast<Constant>(*AI);
-            ir::Register sampler;
-            if (CPV != NULL)
-            {
-              // This is not a kernel argument sampler, we need to append it to sampler set,
-              // and allocate a sampler slot for it.
-              auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-              GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
-              sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
-            } else {
-              sampler = this->getRegister(*AI);
-              ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
-            }
+            const ir::Register sampler = this->appendSampler(AI);
             ++AI;
 
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
@@ -2141,8 +2159,19 @@ namespace gbe
             srcTupleData.push_back(ucoord);
             srcTupleData.push_back(vcoord);
             srcTupleData.push_back(wcoord);
+            // Sixth source: constant sampler-slot offset argument (0 when the workaround is off).
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+            GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+            assert(CPV);
+            auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+            GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+            ir::Register offsetReg(x.data.u32);
+#else
+            ir::Register offsetReg(0);
+#endif
+            srcTupleData.push_back(offsetReg);
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6);
 
             ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
 
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 5ea879c..321fc4e 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -42,19 +42,19 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
 
 // To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj)
 
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj)
 
 // To write_image functions.
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
@@ -143,3 +143,6 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// get sampler info
+DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 1ea150b..b736a88 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -4,6 +4,7 @@
 //
 // Common defines for Image intrinsics
 // Channel order
+#define CLK_HAS_ALPHA(color) ((color) == CLK_A || (color) == CLK_RA || (color) == CLK_RGBA || (color) == CLK_BGRA || (color) == CLK_ARGB) /* true for channel orders with an alpha component */
 enum {
   CLK_R = 0x10B0,
   CLK_A = 0x10B1,
@@ -66,54 +67,52 @@ typedef enum clk_channel_type {
 
 typedef enum clk_sampler_type {
     __CLK_ADDRESS_BASE             = 0,
-    CLK_ADDRESS_NONE               = 0 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_CLAMP              = 1 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_CLAMP_TO_EDGE      = 2 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_REPEAT             = 3 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_MIRROR             = 4 << __CLK_ADDRESS_BASE,
+    CLK_ADDRESS_NONE               = (0 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP              = (1 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP_TO_EDGE      = (2 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_REPEAT             = (3 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_MIRROR             = (4 << __CLK_ADDRESS_BASE),
 
 #if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
     CLK_ADDRESS_MIRRORED_REPEAT    = CLK_ADDRESS_MIRROR,
 #endif
-    __CLK_ADDRESS_MASK             = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+    __CLK_ADDRESS_MASK             = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
                                      CLK_ADDRESS_CLAMP_TO_EDGE |
-                                     CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+                                     CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
     __CLK_ADDRESS_BITS             = 3,        // number of bits required to
                                                // represent address info
 
     __CLK_NORMALIZED_BASE          = __CLK_ADDRESS_BITS,
     CLK_NORMALIZED_COORDS_FALSE    = 0,
-    CLK_NORMALIZED_COORDS_TRUE     = 1 << __CLK_NORMALIZED_BASE,
-    __CLK_NORMALIZED_MASK          = CLK_NORMALIZED_COORDS_FALSE |
-                                     CLK_NORMALIZED_COORDS_TRUE,
+    CLK_NORMALIZED_COORDS_TRUE     = (1 << __CLK_NORMALIZED_BASE),
+    __CLK_NORMALIZED_MASK          = (CLK_NORMALIZED_COORDS_FALSE |
+                                      CLK_NORMALIZED_COORDS_TRUE),
     __CLK_NORMALIZED_BITS          = 1,        // number of bits required to
                                                // represent normalization
-
-    __CLK_FILTER_BASE              = __CLK_NORMALIZED_BASE +
-                                     __CLK_NORMALIZED_BITS,
-    CLK_FILTER_NEAREST             = 0 << __CLK_FILTER_BASE,
-    CLK_FILTER_LINEAR              = 1 << __CLK_FILTER_BASE,
-    CLK_FILTER_ANISOTROPIC         = 2 << __CLK_FILTER_BASE,
-    __CLK_FILTER_MASK              = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
-                                     CLK_FILTER_ANISOTROPIC,
+    __CLK_FILTER_BASE              = (__CLK_NORMALIZED_BASE +  __CLK_NORMALIZED_BITS),
+    CLK_FILTER_NEAREST             = (0 << __CLK_FILTER_BASE),
+    CLK_FILTER_LINEAR              = (1 << __CLK_FILTER_BASE),
+    CLK_FILTER_ANISOTROPIC         = (2 << __CLK_FILTER_BASE),
+    __CLK_FILTER_MASK              = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+                                     CLK_FILTER_ANISOTROPIC),
     __CLK_FILTER_BITS              = 2,        // number of bits required to
                                                // represent address info
 
-    __CLK_MIP_BASE                 = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
-    CLK_MIP_NEAREST                = 0 << __CLK_MIP_BASE,
-    CLK_MIP_LINEAR                 = 1 << __CLK_MIP_BASE,
-    CLK_MIP_ANISOTROPIC            = 2 << __CLK_MIP_BASE,
-    __CLK_MIP_MASK                 = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
-                                     CLK_MIP_ANISOTROPIC,
+    __CLK_MIP_BASE                 = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+    CLK_MIP_NEAREST                = (0 << __CLK_MIP_BASE),
+    CLK_MIP_LINEAR                 = (1 << __CLK_MIP_BASE),
+    CLK_MIP_ANISOTROPIC            = (2 << __CLK_MIP_BASE),
+    __CLK_MIP_MASK                 = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+                                     CLK_MIP_ANISOTROPIC),
     __CLK_MIP_BITS                 = 2,
 
-    __CLK_SAMPLER_BITS             = __CLK_MIP_BASE + __CLK_MIP_BITS,
-    __CLK_SAMPLER_MASK             = __CLK_MIP_MASK | __CLK_FILTER_MASK |
-                                     __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+    __CLK_SAMPLER_BITS             = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+    __CLK_SAMPLER_MASK             = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+                                      __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
 
-    __CLK_SAMPLER_ARG_BASE         = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+    __CLK_SAMPLER_ARG_BASE         = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
     __CLK_SAMPLER_ARG_BITS         = 8,
-    __CLK_SAMPLER_ARG_MASK         = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+    __CLK_SAMPLER_ARG_MASK         = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
     __CLK_SAMPLER_ARG_KEY_BIT      = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
     __CLK_SAMPLER_ARG_KEY_BITS     = 1,
 
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index ff6f251..26fa8b4 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -2255,19 +2255,19 @@ int __gen_ocl_force_simd16(void);
 // Image access functions
 /////////////////////////////////////////////////////////////////////////////
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
-
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
@@ -2287,22 +2287,63 @@ int __gen_ocl_get_image_height(uint surface_id);
 int __gen_ocl_get_image_channel_data_type(uint surface_id);
 int __gen_ocl_get_image_channel_order(uint surface_id);
 int __gen_ocl_get_image_depth(uint surface_id);
+ushort __gen_ocl_get_sampler_info(uint sampler_id); /* sampler flag bits; read_image masks them for address mode, filter and normalized-coords */
 
 #define GET_IMAGE(cl_image, surface_id) \
     uint surface_id = (uint)cl_image
 
-#define DECL_READ_IMAGE(image_type, type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, sampler_t sampler, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, sampler, coord));\
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1 /* switch passed to DECL_IMAGE / DECL_READ_IMAGE to enable the workaround paths */
+#else
+#define GEN_FIX_1 0 /* every workaround switch handed to DECL_READ_IMAGE is then 0 */
+#endif
+
+#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          \
+                        image_type, type, suffix, coord_type)                \
+  /* Defines read_image<suffix>(image, sampler, coord). The two leading   */ \
+  /* macro arguments are 0/1 switches for the IVB workarounds, which are  */ \
+  /* only taken for samplers that are CLK_ADDRESS_CLAMP + FILTER_NEAREST. */ \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               sampler_t sampler,            \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    coord_type tmpCoord = coord;                                             \
+    if (float_coord_rounding_fix || int_clamping_fix) {                      \
+      const ushort samplerValue = __gen_ocl_get_sampler_info(sampler);       \
+      if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
+          && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
+        if (float_coord_rounding_fix                                         \
+            && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
+          FIXUP_FLOAT_COORD(tmpCoord);                                       \
+        }                                                                    \
+        if (int_clamping_fix) {                                              \
+          /* NOTE(review): OUT_OF_BOX tests against pixel dimensions even when CLK_NORMALIZED_COORDS_TRUE is set; confirm. */ \
+          if (OUT_OF_BOX(tmpCoord, surface_id)) {                            \
+            /* Software border color: alpha is 1 when the channel order has no alpha channel, 0 otherwise. */ \
+            const int order = __gen_ocl_get_image_channel_order(surface_id); \
+            const unsigned int border_alpha = !CLK_HAS_ALPHA(order) ? 1 : 0; \
+            return (type)(0, 0, 0, border_alpha);                            \
+          }                                                                  \
+          return __gen_ocl_read_image ##suffix(                              \
+                     EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);   \
+        }                                                                    \
+      }                                                                      \
+    }                                                                        \
+    return  __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id,      \
+                                          sampler, tmpCoord), 0);            \
   }
 
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(EXPEND_READ_COORD(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord));\
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type)      \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               coord_type coord)             \
+  { /* Sampler-less overload: reads with the fixed flags below and sampler_offset 0. */ \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    return __gen_ocl_read_image ##suffix(                                    \
+           EXPEND_READ_COORD(surface_id,                                     \
+                             CLK_NORMALIZED_COORDS_FALSE                     \
+                             | CLK_ADDRESS_NONE                              \
+                             | CLK_FILTER_NEAREST, coord), 0);               \
   }
 
 #define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
@@ -2315,37 +2356,70 @@ int __gen_ocl_get_image_depth(uint surface_id);
 #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
 
-#define DECL_IMAGE(image_type, type, suffix, n)        \
-  DECL_READ_IMAGE(image_type, type, suffix, int ##n)   \
-  DECL_READ_IMAGE(image_type, type, suffix, float ##n) \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)   \
+#define OUT_OF_BOX(coord, surface) /* true when s0 or s1 is negative or >= the image width / height */ \
+  (coord.s0 < 0 || coord.s1 < 0                                \
+   || coord.s0 >= __gen_ocl_get_image_width(surface)           \
+   || coord.s1 >= __gen_ocl_get_image_height(surface))
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  { /* Move s0/s1 values lying in (-2^-20, 0) further below zero by 2^-9; all literals are single precision. */ \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9f;                                 \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
+      tmpCoord.s1 += -0x1p-9f;                                 \
+  }
+
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) /* emits the read/write overloads for one image type and n-component coordinates */ \
+  DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n) /* integer coords: rounding fix always off */ \
+  DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) /* float coords: rounding fix follows GEN_FIX_1 */ \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n)                      \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                              \
   DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
 
-DECL_IMAGE(image2d_t, int4, i, 2)
-DECL_IMAGE(image2d_t, uint4, ui, 2)
-DECL_IMAGE(image2d_t, float4, f, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)   /* read_imagei: int_clamping_fix follows GEN_FIX_1 */
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2) /* read_imageui: int_clamping_fix follows GEN_FIX_1 */
+DECL_IMAGE(0, image2d_t, float4, f, 2)         /* read_imagef: int_clamping_fix is always 0 */
 
 #undef EXPEND_READ_COORD
 #undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
 
 #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
-
-DECL_IMAGE(image3d_t, int4, i, 4)
-DECL_IMAGE(image3d_t, uint4, ui, 4)
-DECL_IMAGE(image3d_t, float4, f, 4)
-
-DECL_IMAGE(image3d_t, int4, i, 3)
-DECL_IMAGE(image3d_t, uint4, ui, 3)
-DECL_IMAGE(image3d_t, float4, f, 3)
+#define OUT_OF_BOX(coord, surface) /* true when s0, s1 or s2 is negative or >= the image width / height / depth */ \
+  (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0                 \
+   || coord.s0 >= __gen_ocl_get_image_width(surface)            \
+   || coord.s1 >= __gen_ocl_get_image_height(surface)           \
+   || coord.s2 >= __gen_ocl_get_image_depth(surface))
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                             \
+  { /* Move s0/s1/s2 values lying in (-2^-20, 0) further below zero by 2^-9; all literals are single precision. */ \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)             \
+      tmpCoord.s0 += -0x1p-9f;                                  \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)             \
+      tmpCoord.s1 += -0x1p-9f;                                  \
+    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)             \
+      tmpCoord.s2 += -0x1p-9f;                                  \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)   /* image3d_t overloads taking int4 / float4 coordinates */
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)         /* read_imagef: int_clamping_fix is always 0 */
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)   /* image3d_t overloads taking int3 / float3 coordinates */
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)         /* read_imagef: int_clamping_fix is always 0 */
 #undef EXPEND_READ_COORD
 #undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
 
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
 #undef DECL_READ_IMAGE_NOSAMPLER
 #undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
 
 #define DECL_IMAGE_INFO(image_type)    \
   INLINE_OVERLOADABLE  int get_image_width(image_type image) \
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 68630cf..f2c051b 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -180,6 +180,13 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
+  /* Upload sampler information. */
+  offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
+  uint32_t i;
+  for(i = 0; i < ker->sampler_sz; i++, offset += 2) {
+    *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
+  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 1eb790f..6bfc453 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -45,7 +45,7 @@
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
 .image3d_max_depth = 8192,
-.max_samplers = 0,
+.max_samplers = 8,
 .mem_base_addr_align = sizeof(cl_uint) * 8,
 .min_data_type_align_size = sizeof(cl_uint),
 .single_fp_config = 0, /* XXX */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 44f44ef..034ecba 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -832,9 +832,22 @@ static void
 intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
 {
   int index;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+  assert(sampler_sz <= GEN_MAX_SAMPLERS/2); /* slots index + 8 are used for the duplicates inserted below */
+#else
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
-  for(index = 0; index < sampler_sz; index++)
-    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
+#endif
+  for(index = 0; index < sampler_sz; index++) {
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+    /* A CLK_ADDRESS_CLAMP sampler is additionally inserted at slot index + 8 with its address mode replaced by
+     * CLK_ADDRESS_REPEAT. NOTE(review): the patch description speaks of clamp-to-edge; confirm REPEAT is intended. */
+    if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
+      intel_gpgpu_insert_sampler(gpgpu, index + 8,
+                                 (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_REPEAT);
+    }
+#endif
+  }
 }
 
 static void
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
index 39ff3f5..d469fbd 100644
--- a/utests/compiler_copy_image1.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -68,4 +68,4 @@ static void compiler_copy_image1(void)
   OCL_UNMAP_BUFFER(5);
 }
 
-MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_copy_image1);
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);