From 455892e4f7823c947824adae3c070ff7d9a11a3a Mon Sep 17 00:00:00 2001
From: Zhigang Gong <zhigang.gong@linux.intel.com>
Date: Sat, 3 Aug 2013 01:53:44 +0800
Subject: [PATCH] GBE: refactor double support.

There are two major issues in double support:
1. It doesn't work in SIMD16 mode.
2. Vectors are used incorrectly: only the temporary registers
need to be allocated to contiguous registers.

The previous implementation of READ_FLOAT64/WRITE_FLOAT64 in
gen_encoder.cpp contains a lot of duplicated code, and its SIMD16
code path never worked correctly, which makes it very difficult
to build on. So I chose to rewrite those two major functions and
to refine the related parts of the instruction selection stage,
fixing the two major problems above with cleaner code.

Now it works well in both SIMD16 and SIMD8 modes.
Another minor improvement concerns READ_FLOAT64 in SIMD8 mode:
this patch saves one send instruction by reading all 8 doubles
into registers at one time.

Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: Homer Hsing <homer.xing@intel.com>
---
 backend/src/backend/gen_context.cpp        |  21 ++-
 backend/src/backend/gen_encoder.cpp        | 231 +++++++++++++----------------
 backend/src/backend/gen_encoder.hpp        |   7 +-
 backend/src/backend/gen_insn_selection.cpp |  65 +++++---
 backend/src/backend/gen_reg_allocation.cpp |  10 +-
 backend/src/backend/gen_register.hpp       |  25 ++++
 backend/src/llvm/llvm_gen_backend.cpp      |   4 +-
 7 files changed, 201 insertions(+), 162 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index e33d8da..655b1d7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -354,12 +354,18 @@ namespace gbe
     p->pop();
   }
 
+  //  For SIMD8, we allocate 2*elemNum temporary registers starting at dst(0),
+  //  followed by the real destination registers.
+  //  For SIMD16, we allocate elemNum temporary registers starting at dst(0).
   void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(0));
+    const uint32_t elemNum = insn.extra.elem;
+    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister tempAddr = ra->genReg(insn.src(1));
     const uint32_t bti = insn.extra.function;
-    const uint32_t elemNum = insn.extra.elem;
-    p->READ_FLOAT64(dst, src, bti, elemNum);
+    p->READ_FLOAT64(dst, tmp, tempAddr, src, bti, elemNum);
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
@@ -370,11 +376,16 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
+  //  For SIMD8, we allocate 2*elemNum temporary registers starting at src(1),
+  //  followed by the real data registers; src(0) holds the address.
+  //  For SIMD16, we allocate elemNum temporary registers starting at src(1).
   void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
     const uint32_t elemNum = insn.extra.elem;
-    p->WRITE_FLOAT64(src, bti, elemNum);
+    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+    const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+    const uint32_t bti = insn.extra.function;
+    p->WRITE_FLOAT64(src, data, bti, elemNum);
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index f84c6dd..b2be32f 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -356,103 +356,69 @@ namespace gbe
     0
   };
 
-  static int dst_type(int exec_width) {
-    if (exec_width == 8)
-      return GEN_TYPE_UD;
-    if (exec_width == 16)
-      return GEN_TYPE_UW;
-    NOT_IMPLEMENTED;
-    return 0;
-  }
-
-  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    int w = curr.execWidth;
-    dst = GenRegister::h2(dst);
-    dst.type = GEN_TYPE_UD;
-    src.type = GEN_TYPE_UD;
-    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
-    GenRegister imm4 = GenRegister::immud(4);
-    GenInstruction *insn;
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::uw16grf(r.nr, 0));
-    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
-    push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(dst, r);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
-    pop();
-    ADD(src, src, imm4);
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::uw16grf(r.nr, 0));
-    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
+    src = GenRegister::retype(src, GEN_TYPE_UD);
+    addr = GenRegister::retype(addr, GEN_TYPE_UD);
+    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
+    uint32_t originSimdWidth = curr.execWidth;
+    uint32_t originPredicate = curr.predicate;
+    uint32_t originMask = curr.noMask;
     push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(GenRegister::suboffset(dst, 1), r);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
+          channels < originSimdWidth; channels += 8, currQuarter++) {
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = GEN_MASK_DISABLE;
+      curr.execWidth = 8;
+      /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
+         which is what we want here. */
+      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
+      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
+      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
+      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
+      // Use a SIMD16 read to fetch all the bytes of 8 doubles at one time.
+      curr.execWidth = 16;
+      this->UNTYPED_READ(tmp, addr, bti, elemNum);
+      if (originSimdWidth == 16)
+        curr.quarterControl = currQuarter;
+      curr.predicate = originPredicate;
+      curr.noMask = originMask;
+      // Back to simd8 for correct predication flag.
+      curr.execWidth = 8;
+      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
+    }
     pop();
   }
 
-  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
-    int w = curr.execWidth;
-    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
-    r.type = GEN_TYPE_UD;
-    GenRegister hdr = GenRegister::h2(r);
-    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
-    src.hstride = GEN_HORIZONTAL_STRIDE_2;
-    GenRegister data = GenRegister::offset(r, w / 8);
-    GenRegister imm4 = GenRegister::immud(4);
-    MOV(r, GenRegister::ud8grf(msg.nr, 0));
+  void GenEncoder::WRITE_FLOAT64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+    msg = GenRegister::retype(msg, GEN_TYPE_UD);
+    int originSimdWidth = curr.execWidth;
+    int originPredicate = curr.predicate;
+    int originMask = curr.noMask;
     push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(data, src);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
-    pop();
-    GenInstruction *insn;
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
-    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
-
-    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
-    push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(data, GenRegister::suboffset(src, 1));
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+    for (uint32_t half = 0; half < 2; half++) {
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = GEN_MASK_DISABLE;
+      curr.execWidth = 8;
+      MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+      if (originSimdWidth == 16) {
+        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+        curr.execWidth = 16;
+      }
+      if (half == 1)
+        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
+      curr.predicate = originPredicate;
+      curr.noMask = originMask;
+      this->UNTYPED_WRITE(msg, bti, elemNum);
+    }
+    /* Let's restore the original message(addr) register. */
+    /* XXX could be optimized if we don't allocate the address to the header
+       position of the message. */
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = GEN_MASK_DISABLE;
+    ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4));
     pop();
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
-    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
   }
 
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
@@ -470,7 +436,7 @@ namespace gbe
       NOT_IMPLEMENTED;
 
     this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(this,
@@ -601,25 +567,53 @@ namespace gbe
      return &this->store.back();
   }
 
-  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (dst.isdf() && src.isdf()) {
+  INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                            GenRegister src0, GenRegister src1 = GenRegister::null()) {
        int w = p->curr.execWidth;
        p->push();
-       p->curr.quarterControl = 0;
        p->curr.nibControl = 0;
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
-       p->setSrc0(insn, src);
+       p->setSrc0(insn, src0);
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, src1);
        if (w == 8)
          p->curr.nibControl = 1; // second 1/8 mask
-       else // w == 16
-         p->curr.quarterControl = 1; // second 1/4 mask
        insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, GenRegister::suboffset(dst, w / 2));
-       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
        p->pop();
+  }
+
+  // Double register accessing is a little special.
+  // Per Gen spec, the only supported mode is SIMD8, and it only
+  // handles four doubles each time.
+  // We need to lower SIMD16 down to two SIMD8 and lower SIMD8 down
+  // to two SIMD1x4.
+  INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                           GenRegister src0, GenRegister src1 = GenRegister::null()) {
+      if (p->curr.execWidth == 8)
+        _handleDouble(p, opcode, dst, src0, src1);
+      else if (p->curr.execWidth == 16) {
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        _handleDouble(p, opcode, dst, src0, src1);
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+        if (!GenRegister::isNull(src1))
+          src1 = GenRegister::offset(src1, 2);
+        _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+        p->pop();
+      }
+  }
+
+  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
+     if (dst.isdf() && src.isdf()) {
+       handleDouble(p, opcode, dst, src);
      } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
@@ -653,25 +647,7 @@ namespace gbe
                    GenRegister src1)
   {
     if (dst.isdf() && src0.isdf() && src1.isdf()) {
-       int w = p->curr.execWidth;
-       p->push();
-       p->curr.quarterControl = 0;
-       p->curr.nibControl = 0;
-       GenInstruction *insn = p->next(opcode);
-       p->setHeader(insn);
-       p->setDst(insn, dst);
-       p->setSrc0(insn, src0);
-       p->setSrc1(insn, src1);
-       if (w == 8)
-         p->curr.nibControl = 1; // second 1/8 mask
-       else // w == 16
-         p->curr.quarterControl = 1; // second 1/4 mask
-       insn = p->next(opcode);
-       p->setHeader(insn);
-       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
-       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
-       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
-       p->pop();
+       handleDouble(p, opcode, dst, src0, src1);
     } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
@@ -808,7 +784,16 @@ namespace gbe
     r.width = GEN_WIDTH_1;
     r.hstride = GEN_HORIZONTAL_STRIDE_0;
     push();
+    uint32_t width = curr.execWidth;
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
     MOV(dest, r);
+    if (width == 16) {
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r);
+    }
     pop();
   }
 
@@ -839,14 +824,8 @@ namespace gbe
   void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
     int w = curr.execWidth;
     if (src0.isdf()) {
-      push();
-      curr.execWidth = 16;
-      MOV(dest, src0);
-      if (w == 16) {
-        curr.quarterControl = 1;
-        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
-      }
-      pop();
+      GBE_ASSERT(0); // MOV DF is called from convert instruction,
+                     // We should never convert a df to a df.
     } else {
       GenRegister r0 = GenRegister::h2(r);
       push();
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d3a7165..86e1a71 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -118,10 +118,11 @@ namespace gbe
     ALU2(LINE)
     ALU2(PLN)
     ALU3(MAD)
-    ALU2(MOV_DF);
+    //ALU2(MOV_DF);
 #undef ALU1
 #undef ALU2
 #undef ALU3
+    void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
@@ -142,9 +143,9 @@ namespace gbe
     /*! Atomic instructions */
     void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
     /*! Read 64-bits float arrays */
-    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    void READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Write 64-bits float arrays */
-    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
+    void WRITE_FLOAT64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d4be8bf..83d9b00 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -466,9 +466,9 @@ namespace gbe
     /*! Atomic instruction */
     void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
     /*! Read 64 bits float array */
-    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    void READ_FLOAT64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
     /*! Write 64 bits float array */
-    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -760,12 +760,16 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
+  /* elemNum counts all the temporary registers plus the
+     real destination registers.*/
   void Selection::Opaque::READ_FLOAT64(Reg addr,
+                                       Reg tempAddr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
+                                       uint32_t valueNum,
                                        uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
@@ -773,11 +777,12 @@ namespace gbe
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
     insn->src(0) = addr;
+    insn->src(1) = tempAddr;
     insn->extra.function = bti;
-    insn->extra.elem = elemNum;
+    insn->extra.elem = valueNum;
 
-    // Sends require contiguous allocation
-    dstVector->regNum = elemNum;
+    // Only the temporary registers need contiguous allocation
+    dstVector->regNum = elemNum - valueNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
 
@@ -814,9 +819,12 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
+  /* elemNum counts all the temporary registers plus the
+     real data registers.*/
   void Selection::Opaque::WRITE_FLOAT64(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
+                                        uint32_t valueNum,
                                         uint32_t bti)
   {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
@@ -827,10 +835,10 @@ namespace gbe
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->src(elemID+1) = src[elemID];
     insn->extra.function = bti;
-    insn->extra.elem = elemNum;
+    insn->extra.elem = valueNum;
 
-    // Sends require contiguous allocation for the sources
-    vector->regNum = elemNum+1;
+    // Only the addr + temporary registers need to be contiguous.
+    vector->regNum = (elemNum - valueNum) + 1;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
   }
@@ -1871,13 +1879,18 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      vector<GenRegister> dst(valueNum);
-      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      if (sel.ctx.getSimdWidth() == 16)
-        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+      uint32_t dstID;
+      /* XXX support scalar only right now. */
+      GBE_ASSERT(valueNum == 1);
+
+      // The first 16 DWORD register space is for temporary usage at encode stage.
+      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      GenRegister dst[valueNum + tmpRegNum];
+      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
+        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
+      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
+        dst[dstID] = sel.selReg(insn.getValue(valueID));
+      sel.READ_FLOAT64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti);
     }
 
     void emitByteGather(Selection::Opaque &sel,
@@ -1971,15 +1984,19 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       const uint32_t addrID = ir::StoreInstruction::addressIndex;
       GenRegister addr;
-      vector<GenRegister> value(valueNum);
-
+      uint32_t srcID;
+      /* XXX support scalar only right now. */
+      GBE_ASSERT(valueNum == 1);
       addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
-      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      if (sel.ctx.getSimdWidth() == 16)
-        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+      // The first 16 DWORD register space is for temporary usage at encode stage.
+      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      GenRegister src[valueNum + tmpRegNum];
+      for (srcID = 0; srcID < tmpRegNum; ++srcID)
+        src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+
+      for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
+        src[srcID] = sel.selReg(insn.getValue(valueID));
+      sel.WRITE_FLOAT64(addr, src, valueNum + tmpRegNum, valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index e7c96ac..4ba03ea 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -474,7 +474,12 @@ namespace gbe
       if (it != vectorMap.end()) {
         const SelectionVector *vector = it->second.first;
         const uint32_t simdWidth = ctx.getSimdWidth();
-        const uint32_t alignment = simdWidth * sizeof(uint32_t);
+
+        const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
+        const ir::RegisterFamily family = regData.family;
+        const uint32_t typeSize = familyVectorSize[family];
+        const uint32_t alignment = simdWidth*typeSize;
+
         const uint32_t size = vector->regNum * alignment;
         uint32_t grfOffset;
         while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
@@ -483,7 +488,8 @@ namespace gbe
         }
         for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
           const ir::Register reg = vector->reg[regID].reg();
-          GBE_ASSERT(RA.contains(reg) == false);
+          GBE_ASSERT(RA.contains(reg) == false
+                     && ctx.sel->getRegisterData(reg).family == family);
           RA.insert(std::make_pair(reg, grfOffset));
         }
       }
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index fedb743..7e48837 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -553,6 +553,11 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_1);
     }
 
+    static INLINE bool isNull(GenRegister reg) {
+      return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+              && reg.nr == GEN_ARF_NULL);
+    }
+
     static INLINE GenRegister acc(void) {
       return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                          GEN_ARF_ACCUMULATOR,
@@ -832,6 +837,26 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_2);
     }
 
+    static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
     static INLINE GenRegister mask(uint32_t subnr) {
       return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
     }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index c8c5484..b5963ad 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2371,8 +2371,8 @@ namespace gbe
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
-      if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
-        OCL_SIMD_WIDTH = 8;
+      //if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
+      //  OCL_SIMD_WIDTH = 8;
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
         ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
-- 
2.7.4