[Beignet] [PATCH 1/3] GBE: Change 64bit integer storage in register

Thu May 22 00:06:52 PDT 2014

Previously, we store low/high half of 64bit together, which need several
32bit instructions to do one 64bit instruction. Now we simply change its
storage in register, low 32bit of all lanes are stored together, and then the
high 32bit of all lanes. This will make long support cleaner and less
32bit instructions needed.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_context.cpp        |  226 +++++-----------------------
 backend/src/backend/gen_encoder.cpp        |   96 +-----------
 backend/src/backend/gen_encoder.hpp        |    6 +-
 backend/src/backend/gen_insn_selection.cpp |   83 +++++-----
 backend/src/backend/gen_reg_allocation.cpp |   33 ++--
 backend/src/backend/gen_register.hpp       |   25 +--
 backend/src/llvm/llvm_gen_backend.cpp      |    5 +
 7 files changed, 124 insertions(+), 350 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index f4c80e3..25f690a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -213,17 +213,7 @@ namespace gbe
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
       case SEL_OP_CONVI64_TO_I:
        {
-        int execWidth = p->curr.execWidth;
-        GenRegister xsrc = src.bottom_half(), xdst = dst;
-        p->push();
-        p->curr.execWidth = 8;
-        for(int i = 0; i < execWidth/4; i ++) {
-          p->curr.chooseNib(i);
-          p->MOV(xdst, xsrc);
-          xdst = GenRegister::suboffset(xdst, 4);
-          xsrc = GenRegister::suboffset(xsrc, 4);
-        }
-        p->pop();
+        p->MOV(dst, src.bottom_half());
         break;
        }
       case SEL_OP_BRC:
@@ -268,28 +258,18 @@ namespace gbe
         p->MOV_DF(dst, src, tmp);
         break;
       case SEL_OP_CONVI_TO_I64: {
-        GenRegister middle;
-        if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
+        GenRegister middle = src;
+        if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
           middle = tmp;
-          middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
+          middle.type = GEN_TYPE_D;
           p->MOV(middle, src);
-        } else {
-          middle = src;
         }
-        int execWidth = p->curr.execWidth;
-        p->push();
-        p->curr.execWidth = 8;
-        for (int nib = 0; nib < execWidth / 4; nib ++) {
-          p->curr.chooseNib(nib);
-          p->MOV(dst.bottom_half(), middle);
-          if(middle.is_signed_int())
-            p->ASR(dst.top_half(), middle, GenRegister::immud(31));
-          else
-            p->MOV(dst.top_half(), GenRegister::immd(0));
-          dst = GenRegister::suboffset(dst, 4);
-          middle = GenRegister::suboffset(middle, 4);
-        }
-        p->pop();
+
+        p->MOV(dst.bottom_half(), middle);
+        if(src.is_signed_int())
+          p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
+        else
+          p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
       default:
@@ -304,8 +284,10 @@ namespace gbe
     GenRegister tmp = ra->genReg(insn.dst(1));
     switch (insn.opcode) {
       case SEL_OP_I64ADD: {
-        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
-                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
         loadBottomHalf(x, src0);
         loadBottomHalf(y, src1);
         addWithCarry(x, x, y);
@@ -318,8 +300,10 @@ namespace gbe
         break;
       }
       case SEL_OP_I64SUB: {
-        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
-                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
         loadBottomHalf(x, src0);
         loadBottomHalf(y, src1);
         subWithBorrow(x, x, y);
@@ -400,21 +384,8 @@ namespace gbe
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_SEL_INT64:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4);
-            xsrc0 = GenRegister::suboffset(xsrc0, 4);
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_AND:  p->AND(dst, src0, src1, insn.extra.function); break;
@@ -422,59 +393,20 @@ namespace gbe
       case SEL_OP_XOR:  p->XOR(dst, src0, src1, insn.extra.function); break;
       case SEL_OP_I64AND:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_I64OR:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_I64XOR:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_SHR:  p->SHR(dst, src0, src1); break;
@@ -492,18 +424,8 @@ namespace gbe
           GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
                       xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
                       xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->MOV(xdst.top_half(), xsrc0.bottom_half());
-            p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
-            xdst = GenRegister::suboffset(xdst, 4);
-            xsrc0 = GenRegister::suboffset(xsrc0, 4);
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
+          p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
         }
         break;
       default: NOT_IMPLEMENTED;
@@ -511,16 +433,10 @@ namespace gbe
   }
 
   void GenContext::collectShifter(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
     p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    p->curr.execWidth = 8;
-    for (int nib = 0; nib < execWidth / 4; nib ++) {
-      p->AND(dest, src.bottom_half(), GenRegister::immud(63));
-      dest = GenRegister::suboffset(dest, 4);
-      src = GenRegister::suboffset(src, 4);
-    }
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+    p->AND(dest, src.bottom_half(), GenRegister::immud(63));
     p->pop();
   }
 
@@ -1267,73 +1183,19 @@ namespace gbe
   }
 
   void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    src = src.top_half();
-    p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest, src.top_half(this->simdWidth));
   }
 
   void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    dest = dest.top_half();
-    p->push();
-    p->curr.noMask = 0;
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest.top_half(this->simdWidth), src);
   }
 
   void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    src = src.bottom_half();
-    p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest, src.bottom_half());
   }
 
   void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    dest = dest.bottom_half();
-    p->push();
-    p->curr.execWidth = 8;
-    p->curr.noMask = 0;
-    p->MOV(dest, src);
-    p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest.bottom_half(), src);
   }
 
   void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
@@ -1770,18 +1632,12 @@ namespace gbe
     p->pop();
   }
 
-  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
-  //  then follow the real destination registers.
-  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
   void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
     const uint32_t elemNum = insn.extra.elem;
-    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
-    const GenRegister tempAddr = ra->genReg(insn.dst(tmpRegSize + 1));
-    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
-    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function;
-    p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
+    p->UNTYPED_READ(dst, src, bti, elemNum*2);
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
@@ -1792,17 +1648,11 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
-  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
-  //  then follow the real destination registers.
-  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
   void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.dst(0));
     const uint32_t elemNum = insn.extra.elem;
-    const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
-    const GenRegister data = ra->genReg(insn.src(1));
     const uint32_t bti = insn.extra.function;
-    p->MOV(src, addr);
-    p->WRITE64(src, data, bti, elemNum, sel->isScalarReg(data.reg()));
+    p->UNTYPED_WRITE(src, bti, elemNum*2);
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 7078dcb..7ecb4c4 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -216,6 +216,7 @@ namespace gbe
   GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID, int jump_width) :
     stateNum(0), gen(gen), deviceID(deviceID), jump_width(jump_width)
   {
+    this->simdWidth = simdWidth;
     this->curr.execWidth = simdWidth;
     this->curr.quarterControl = GEN_COMPRESSION_Q1;
     this->curr.noMask = 0;
@@ -370,76 +371,6 @@ namespace gbe
     0
   };
 
-  void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
-    src = GenRegister::retype(src, GEN_TYPE_UD);
-    addr = GenRegister::retype(addr, GEN_TYPE_UD);
-    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
-    uint32_t originSimdWidth = curr.execWidth;
-    uint32_t originPredicate = curr.predicate;
-    uint32_t originMask = curr.noMask;
-    push();
-    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
-          channels < originSimdWidth; channels += 8, currQuarter++) {
-      curr.predicate = GEN_PREDICATE_NONE;
-      curr.noMask = GEN_MASK_DISABLE;
-      curr.execWidth = 8;
-      /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
-         which is what we want here. */
-      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
-      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
-      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
-      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
-      // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
-      curr.execWidth = 16;
-      this->UNTYPED_READ(tmp, addr, bti, elemNum);
-      if (originSimdWidth == 16)
-        curr.quarterControl = currQuarter;
-      curr.predicate = originPredicate;
-      curr.noMask = originMask;
-      // Back to simd8 for correct predication flag.
-      curr.execWidth = 8;
-      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
-    }
-    pop();
-  }
-
-  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
-    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
-    GenRegister unpacked;
-    msg = GenRegister::retype(msg, GEN_TYPE_UD);
-    int originSimdWidth = curr.execWidth;
-    int originPredicate = curr.predicate;
-    int originMask = curr.noMask;
-    push();
-    for (uint32_t half = 0; half < 2; half++) {
-      curr.predicate = GEN_PREDICATE_NONE;
-      curr.noMask = GEN_MASK_DISABLE;
-      curr.execWidth = 8;
-      if (is_scalar) {
-        unpacked = data32;
-        unpacked.subnr += half * 4;
-      } else
-        unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
-      MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
-      if (originSimdWidth == 16) {
-        if (is_scalar) {
-          unpacked = data32;
-          unpacked.subnr += half * 4;
-        } else
-          unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
-        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
-        curr.execWidth = 16;
-      }
-      if (half == 1)
-        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
-      curr.predicate = originPredicate;
-      curr.noMask = originMask;
-      this->UNTYPED_WRITE(msg, bti, elemNum);
-    }
-    pop();
-  }
-
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
@@ -672,17 +603,8 @@ namespace gbe
      if (dst.isdf() && src.isdf()) {
        handleDouble(p, opcode, dst, src);
      } else if (dst.isint64() && src.isint64()) { // handle int64
-       int execWidth = p->curr.execWidth;
-       p->push();
-       p->curr.execWidth = 8;
-       for (int nib = 0; nib < execWidth / 4; nib ++) {
-         p->curr.chooseNib(nib);
-         p->MOV(dst.bottom_half(), src.bottom_half());
-         p->MOV(dst.top_half(), src.top_half());
-         dst = GenRegister::suboffset(dst, 4);
-         src = GenRegister::suboffset(src, 4);
-       }
-       p->pop();
+       p->MOV(dst.bottom_half(), src.bottom_half());
+       p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
      } else if (needToSplitAlu1(p, dst, src) == false) {
       if(compactAlu1(p, opcode, dst, src, condition, false))
         return;
@@ -915,16 +837,8 @@ namespace gbe
 
   void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
     GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
-    int execWidth = curr.execWidth;
-    push();
-    curr.execWidth = 8;
-    for(int nib = 0; nib < execWidth/4; nib ++) {
-      curr.chooseNib(nib);
-      MOV(dest.top_half(), u1);
-      MOV(dest.bottom_half(), u0);
-      dest = GenRegister::suboffset(dest, 4);
-    }
-    pop();
+    MOV(dest.bottom_half(), u0);
+    MOV(dest.top_half(this->simdWidth), u1);
   }
 
   void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index e0bb4cc..627a311 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -90,6 +90,8 @@ namespace gbe
     uint32_t deviceID;
     /*! The constant for jump. */
     const int jump_width;
+    /*! simd width for this codegen */
+    uint32_t simdWidth;
     ////////////////////////////////////////////////////////////////////////
     // Encoding functions
     ////////////////////////////////////////////////////////////////////////
@@ -168,10 +170,6 @@ namespace gbe
     void WAIT(void);
     /*! Atomic instructions */
     virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    /*! Read 64-bits float/int arrays */
-    void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
-    /*! Write 64-bits float/int arrays */
-    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
     /*! Untyped read (upto 4 channels) */
     virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2ab3aae..3f7154f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -536,9 +536,9 @@ namespace gbe
     /*! Atomic instruction */
     void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
     /*! Read 64 bits float/int array */
-    void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+    void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Write 64 bits float/int array */
-    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
+    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -1033,34 +1033,27 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
-  /* elemNum contains all the temporary register and the
-     real destination registers.*/
   void Selection::Opaque::READ64(Reg addr,
-                                 Reg tempAddr,
                                  const GenRegister *dst,
                                  uint32_t elemNum,
-                                 uint32_t valueNum,
                                  uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
-    /* temporary addr register is to be modified, set it to dst registers.*/
-    insn->dst(elemNum) = tempAddr;
+
     insn->src(0) = addr;
     insn->extra.function = bti;
-    insn->extra.elem = valueNum;
+    insn->extra.elem = elemNum;
 
-    // Only the temporary registers need contiguous allocation
-    dstVector->regNum = elemNum - valueNum;
+    dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
 
-    // Source cannot be scalar (yet)
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
@@ -1087,36 +1080,30 @@ namespace gbe
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
-    // Source cannot be scalar (yet)
+
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
   }
 
-  /* elemNum contains all the temporary register and the
-     real data registers.*/
   void Selection::Opaque::WRITE64(Reg addr,
                                   const GenRegister *src,
                                   uint32_t srcNum,
-                                  const GenRegister *dst,
-                                  uint32_t dstNum,
                                   uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
     SelectionVector *vector = this->appendVector();
 
     // Regular instruction to encode
     insn->src(0) = addr;
     for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
       insn->src(elemID + 1) = src[elemID];
-    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
-      insn->dst(elemID) = dst[elemID];
+
     insn->extra.function = bti;
     insn->extra.elem = srcNum;
 
-    // Only the addr + temporary registers need to be contiguous.
-    vector->regNum = dstNum;
-    vector->reg = &insn->dst(0);
+    vector->regNum = srcNum + 1;
+    vector->reg = &insn->src(0);
     vector->isSrc = 1;
   }
 
@@ -2643,18 +2630,13 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      uint32_t dstID;
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
 
-      // The first 16 DWORD register space is for temporary usage at encode stage.
-      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
-      GenRegister dst[valueNum + tmpRegNum];
-      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
-        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
-      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
-        dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
+      GenRegister dst[valueNum];
+      for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
+      sel.READ64(addr, dst, valueNum, bti);
     }
 
     void emitByteGather(Selection::Opaque &sel,
@@ -2803,22 +2785,14 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      uint32_t srcID;
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(addr, GEN_TYPE_F);
-      // The first 16 DWORD register space is for temporary usage at encode stage.
-      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      addr = GenRegister::retype(addr, GEN_TYPE_UD);
       GenRegister src[valueNum];
-      GenRegister dst[tmpRegNum + 1];
-      /* dst 0 is for the temporary address register. */
-      dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
-      for (srcID = 0; srcID < tmpRegNum; ++srcID)
-        dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
+      sel.WRITE64(addr, src, valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -3009,6 +2983,11 @@ namespace gbe
         narrowNum = srcNum;
         narrowDst = 0;
       }
+      // As we store long/ulong low/high part separately,
+      // we need to deal with it separately, we need to change it back again
+      // when hardware support native long type.
+      const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
+      const int simdWidth = sel.curr.execWidth;
 
       for(int i = 0; i < narrowNum; i++, index++) {
         GenRegister narrowReg, wideReg;
@@ -3030,16 +3009,26 @@ namespace gbe
             GBE_ASSERT(multiple == 8);
           }
         }
-        if(index % multiple) {
+
+        if(!isInt64 && index % multiple) {
           wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
           wideReg.subphysical = 1;
         }
+        if(isInt64) {
+          // offset to next half
+          wideReg.subphysical = 1;
+          if(i >= multiple/2)
+            wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+          if(index % (multiple/2))
+            wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+        }
+
         GenRegister xdst = narrowDst ? narrowReg : wideReg;
         GenRegister xsrc = narrowDst ? wideReg : narrowReg;
 
-        if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
-           (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
-          const int simdWidth = sel.curr.execWidth;
+        if(isInt64) {
+          sel.MOV(xdst, xsrc);
+        } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
           sel.push();
             sel.curr.execWidth = 8;
             xdst.subphysical = 1;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 880a267..bab65e5 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -661,25 +661,38 @@ namespace gbe
            != spilledRegs.end())
           continue;
 
-        uint32_t alignment;
-        ir::RegisterFamily family;
-        getRegAttrib(reg, alignment, &family);
-        const uint32_t size = vector->regNum * alignment;
-        const uint32_t grfOffset = allocateReg(interval, size, alignment);
+        uint32_t alignment, maxAlignment = 0;
+        uint32_t size = 0;
+        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+          getRegAttrib(reg, alignment, NULL);
+          if(alignment > maxAlignment)
+            maxAlignment = alignment;
+          size += alignment;
+        }
+
+        const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
         if(grfOffset == 0) {
-          GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
+          ir::RegisterFamily family;
           for(int i = vector->regNum-1; i >= 0; i--) {
+            family = ctx.sel->getRegisterFamily(vector->reg[i].reg());
+            // we currently only support DWORD/QWORD spill
+            if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
+              return false;
             if (!spillReg(vector->reg[i].reg()))
               return false;
           }
           continue;
         }
+        uint32_t subOffset = 0;
         for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
           const ir::Register reg = vector->reg[regID].reg();
-          GBE_ASSERT(RA.contains(reg) == false
-                     && ctx.sel->getRegisterData(reg).family == family);
-          insertNewReg(reg, grfOffset + alignment * regID, true);
-          ctx.splitBlock(grfOffset, alignment * regID);  //splitBlock will not split if regID == 0
+          GBE_ASSERT(RA.contains(reg) == false);
+          getRegAttrib(reg, alignment, NULL);
+          // check all sub registers aligned correctly
+          GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
+          insertNewReg(reg, grfOffset + subOffset, true);
+          ctx.splitBlock(grfOffset, subOffset);  //splitBlock will not split if regID == 0
+          subOffset += alignment;
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 50a6dcd..3967e6e 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -301,20 +301,25 @@ namespace gbe
       return false;
     }
 
-    INLINE GenRegister top_half(void) const {
-      GenRegister r = bottom_half();
-      r.subnr += 4;
-      r.nr += r.subnr / 32;
-      r.subnr %= 32;
-      return r;
+    INLINE GenRegister top_half(int simdWidth) const {
+      GBE_ASSERT(isint64());
+      GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      } else {
+        reg.subnr += typeSize(reg.type);
+        reg.nr += reg.subnr/32;
+        reg.subnr %= 32;
+      }
+      return reg;
     }
 
     INLINE GenRegister bottom_half(void) const {
       GBE_ASSERT(isint64());
-      GenRegister r = h2(*this);
-      r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
-      if(r.vstride != GEN_VERTICAL_STRIDE_0)
-       r.vstride = GEN_VERTICAL_STRIDE_16;
+      GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
       return r;
     }
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 82429d0..bd111b0 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1868,6 +1868,11 @@ namespace gbe
         uint32_t srcElemNum = 0, dstElemNum = 0 ;
         ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
         ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+        // As long and double are not compatible in register storage
+        // and we do not support double yet, simply put an assert here
+        GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
+        GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
+
         if(srcElemNum > 1 || dstElemNum > 1) {
           // Build the tuple data in the vector
           vector<ir::Register> srcTupleData;
-- 
1.7.10.4