[Beignet] [PATCH 4/4] gbe: Implement a new BTI solution to support dynamic bti

Ruiling Song ruiling.song at intel.com
Wed Apr 29 20:49:49 PDT 2015


The old implementation statically analyzes the pointer base, and thus
manages compile-time BTIs for all memory access instructions. The new implementation
introduces a virtual register to hold the BTI value for each memory access instruction.
The main benefit of this new method is that it can handle storing/loading pointers.
This is a big step towards supporting storing/loading pointers.

Consider the following example:
void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
  %cmp = icmp slt i32 %add4.i, 5
  %cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
  store i32 %6, i32 addrspace(1)* %10, align 4, !tbaa !31
}

will be changed to:

void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
  %cmp = icmp slt i32 %add4.i, 5

  // newly added instructions:
  // %0 holds the value of the BTI: '3' is the bti of dst1, '4' is the bti of dst2
  // %1 holds the value of the starting address for the BTI, which will be subtracted.

  %0 = select i1 %cmp, i32 3, i32 4
  %1 = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2

  %cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
  store i32 %cond, i32 addrspace(1)* %10, align 4
}

The idea of the solution is: check the bti register and select the bti of one lane that has not been accessed yet (through 'lzd'),
issue the send message to that bti, then continue with the remaining un-accessed lanes and repeat these steps.

For a mixed pointer, the final asm looks like below:
(g118 (offset 0xec0) is the register that holds the bti of all lanes)
((31-lzd(active_lane_mask))*4 + bti_reg_start) is the location of the target bti for this iteration

As the gen backend currently only allows one flag register per SelectionInstruction,
I have to save the flag at (54) and restore it at (64) in the example below.

    (      38)  mov(1)          f0.1<2>:UW      0x0UW                           { align1 WE_all };
    (      40)  cmp.ne.f0.1(16) null:F          f0.1<0,1,0>:UW  0x1UW           { align1 WE_normal 1H switch };
    (      42)  and(1)          g8.2<1>:UD      f0.1<0,1,0>:UW  0xffffffffUD    { align1 WE_all };
    (      44)  lzd(1)          g8.2<1>:UD      g8.2<0,1,0>:UD                  { align1 WE_all };
    (      46)  add(1)          g8.4<2>:UW      -g8.4<0,1,0>:UW 0x1fUW          { align1 WE_all };
    (      48)  mul(1)          g8.4<2>:UW      g8.4<0,1,0>:UW  0x4UW           { align1 WE_all };
    (      50)  add(1)          a0<2>:UW        g8.4<0,1,0>:UW  0xec0UD         { align1 WE_all };
    (      52)  mov(1)          g8.2<1>:UD      g[a0]<0,1,0>:UD                 { align1 WE_all };
    (      54)  mov(1)          g121.14<2>:UW   f0.1<0,1,0>:UW                  { align1 WE_all };
    (      56)  cmp.e.f0.1(8)   null:F          g118<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 1Q switch };
    (      58)  cmp.e.f0.1(8)   null:F          g119<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 2Q switch };
    (      60)  or(1)           a0<1>:UD        g8.8<0,1,0>:UB  0x8035e00UD     { align1 WE_all };
    (      62)  (+f0.1) send(16) null:UW        g104<8,8,1>:UD  a0<0,1,0>:UW
                data                                            { align1 WE_normal 1H };
    (      64)  mov(1)          f0.1<2>:UW      g121.14<0,1,0>:UW               { align1 WE_all };
    (      66)  (+f0.1) cmp.ne.f0.1(8) null:F   g118<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 1Q switch };
    (      68)  (+f0.1) cmp.ne.f0.1(8) null:F   g119<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 2Q switch };
    (      70)  (+f0.1) while(16) -28                                           { align1 WE_normal 1H };

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c  | 100 ++---
 backend/src/backend/gen75_encoder.cpp      |  80 +++-
 backend/src/backend/gen75_encoder.hpp      |   9 +-
 backend/src/backend/gen8_context.cpp       |  49 ++-
 backend/src/backend/gen8_encoder.cpp       |  79 +++-
 backend/src/backend/gen8_encoder.hpp       |   9 +-
 backend/src/backend/gen_context.cpp        | 204 +++++++++-
 backend/src/backend/gen_context.hpp        |   2 +
 backend/src/backend/gen_encoder.cpp        | 143 +++++--
 backend/src/backend/gen_encoder.hpp        |  16 +-
 backend/src/backend/gen_insn_selection.cpp | 465 +++++++++++++---------
 backend/src/backend/gen_insn_selection.hpp |  20 +-
 backend/src/backend/gen_register.hpp       |   2 +
 backend/src/backend/program.h              |   1 +
 backend/src/ir/context.hpp                 |   8 +-
 backend/src/ir/instruction.cpp             | 109 ++++--
 backend/src/ir/instruction.hpp             |  36 +-
 backend/src/ir/profile.cpp                 |   4 +-
 backend/src/ir/profile.hpp                 |   3 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 606 +++++++++++++++++++++++------
 20 files changed, 1405 insertions(+), 540 deletions(-)

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 711b943..5f5fd3a 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -99,8 +99,8 @@ static const struct {
   [GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
 
-  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
-  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1249,59 +1249,61 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
                      target, &space);
     }
 
-    switch (target) {
-      case GEN_SFID_SAMPLER:
-        format(file, " (%d, %d, %d, %d)",
-               SAMPLE_BTI(inst),
-               SAMPLER(inst),
-               SAMPLER_MSG_TYPE(inst),
-               SAMPLER_SIMD_MODE(inst));
-        break;
-      case GEN_SFID_DATAPORT_DATA:
-        if(UNTYPED_RW_CATEGORY(inst) == 0) {
+    if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+      switch (target) {
+        case GEN_SFID_SAMPLER:
+          format(file, " (%d, %d, %d, %d)",
+                 SAMPLE_BTI(inst),
+                 SAMPLER(inst),
+                 SAMPLER_MSG_TYPE(inst),
+                 SAMPLER_SIMD_MODE(inst));
+          break;
+        case GEN_SFID_DATAPORT_DATA:
+          if(UNTYPED_RW_CATEGORY(inst) == 0) {
+            format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                   UNTYPED_RW_BTI(inst),
+                   UNTYPED_RW_RGBA(inst),
+                   data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                   data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                   data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+          } else {
+            format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                   SCRATCH_RW_OFFSET(inst),
+                   data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
+                   data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
+                   data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
+                   data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
+          }
+          break;
+        case GEN_SFID_DATAPORT1_DATA:
           format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
                  UNTYPED_RW_BTI(inst),
                  UNTYPED_RW_RGBA(inst),
                  data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
                  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-                 data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
-        } else {
-          format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
-                 SCRATCH_RW_OFFSET(inst),
-                 data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
-                 data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
-                 data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
-                 data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
-        }
-        break;
-      case GEN_SFID_DATAPORT1_DATA:
-        format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
-               UNTYPED_RW_BTI(inst),
-               UNTYPED_RW_RGBA(inst),
-               data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
-               data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-               data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
-        break;
-      case GEN_SFID_DATAPORT_CONSTANT:
-        format(file, " (bti: %d, %s)",
-               DWORD_RW_BTI(inst),
-               data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
-        break;
-      case GEN_SFID_MESSAGE_GATEWAY:
-        format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
-               gateway_sub_function[MSG_GW_SUBFUNC(inst)],
-               MSG_GW_NOTIFY(inst),
-               MSG_GW_ACKREQ(inst));
-        break;
-
-      default:
-        format(file, "unsupported target %d", target);
-        break;
+                 data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+          break;
+        case GEN_SFID_DATAPORT_CONSTANT:
+          format(file, " (bti: %d, %s)",
+                 DWORD_RW_BTI(inst),
+                 data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
+          break;
+        case GEN_SFID_MESSAGE_GATEWAY:
+          format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
+                 gateway_sub_function[MSG_GW_SUBFUNC(inst)],
+                 MSG_GW_NOTIFY(inst),
+                 MSG_GW_ACKREQ(inst));
+          break;
+
+        default:
+          format(file, "unsupported target %d", target);
+          break;
+      }
+      if (space)
+        string(file, " ");
+      format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
+      format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
     }
-    if (space)
-      string(file, " ");
-    format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
-    format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
   }
   pad(file, 64);
   if (OPCODE(inst) != GEN_OPCODE_NOP) {
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index c77ce4d..21495af 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -96,8 +96,7 @@ namespace gbe
     gen7_insn->bits3.gen7_typed_rw.slot = 1;
   }
 
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned Gen75Encoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -111,11 +110,6 @@ namespace gbe
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     gen7_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -129,11 +123,26 @@ namespace gbe
       gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return gen7_insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen75Encoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -144,44 +153,75 @@ namespace gbe
       response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen75Encoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 1 + elemNum;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 2 * (1 + elemNum);
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_SURFACE_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+
   void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
     union { double d; unsigned u[2]; } u;
     u.d = value;
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index 9545157..31a199c 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -48,15 +48,18 @@ namespace gbe
     virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
     virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                       unsigned char msg_type, uint32_t msg_length,
                                       bool header_present);
+    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
   };
 }
 #endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 2cdb248..e5f335e 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -800,19 +800,35 @@ namespace gbe
       p->pop();
     }
   }
-
   void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
   {
-    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
     GBE_ASSERT(elemNum == 1);
 
-    const GenRegister addr = ra->genReg(insn.src(0));
-    const GenRegister tmp_dst = ra->genReg(insn.dst(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister bti = ra->genReg(insn.src(1));
 
     /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
        which we can not accept. We just fallback to 2 DW untyperead here. */
-    p->UNTYPED_READ(tmp_dst, addr, bti, elemNum*2);
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, 2*elemNum);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(2*elemNum));
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, 2*elemNum);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), 2*elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
 
     for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
       GenRegister long_tmp = ra->genReg(insn.dst(elemID));
@@ -823,11 +839,10 @@ namespace gbe
 
   void Gen8Context::emitWrite64Instruction(const SelectionInstruction &insn)
   {
-    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
     GBE_ASSERT(elemNum == 1);
-
     const GenRegister addr = ra->genReg(insn.src(elemNum));
+    const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
 
     /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
        which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -837,9 +852,25 @@ namespace gbe
       this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
     }
 
-    p->UNTYPED_WRITE(addr, bti, elemNum*2);
-  }
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(addr, bti, elemNum*2);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum*2);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
 
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
+  }
   void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index f02a2ca..ce6150d 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -103,9 +103,7 @@ namespace gbe
   void Gen8Encoder::F32TO16(GenRegister dest, GenRegister src0) {
     MOV(GenRegister::retype(dest, GEN_TYPE_HF), GenRegister::retype(src0, GEN_TYPE_F));
   }
-
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned Gen8Encoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -119,11 +117,6 @@ namespace gbe
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     gen8_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -137,11 +130,26 @@ namespace gbe
       gen8_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+  unsigned Gen8Encoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -152,44 +160,73 @@ namespace gbe
       response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen8Encoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 1 + elemNum;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 2 * (1 + elemNum);
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_SURFACE_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
   void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
     union { double d; unsigned u[2]; } u;
     u.d = value;
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 4c5e556..37faf25 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -49,9 +49,9 @@ namespace gbe
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
@@ -66,6 +66,9 @@ namespace gbe
                        GenRegister src0, GenRegister src1, GenRegister src2);
     virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
                             GenRegister src1 = GenRegister::null());
+    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
   };
 }
 #endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 62fd596..c389963 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -205,7 +205,8 @@ namespace gbe
       p->curr.execWidth = 1;
       p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
       p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, bufferptr);
+      // let private address start from zero
+      p->ADD(stackptr, stackptr, GenRegister::immud(0));
       p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
     p->pop();
   }
@@ -1689,9 +1690,27 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
     const uint32_t function = insn.extra.function;
-    const uint32_t bti = insn.getbti();
+    unsigned srcNum = insn.extra.elem;
+
+    const GenRegister bti = ra->genReg(insn.src(srcNum));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->ATOMIC(dst, function, src, bti, srcNum);
+    } else {
+      GenRegister flagTemp = ra->genReg(insn.dst(1));
+
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateAtomicMessageDesc(&nativeInsn, function, 0, srcNum);
 
-    p->ATOMIC(dst, function, src, bti, insn.srcNum);
+      unsigned jip0 = beforeMessage(insn, bti, flagTemp, desc);
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
+      p->pop();
+      afterMessage(insn, bti, flagTemp, jip0);
+    }
   }
 
   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
@@ -1811,48 +1830,200 @@ namespace gbe
   }
 
   void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
-    const uint32_t elemNum = insn.extra.elem;
+    const uint32_t elemNum = insn.extra.elem * 2; // channels to read: two per 64-bit element
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
-    p->UNTYPED_READ(dst, src, bti, elemNum*2);
+    const GenRegister bti = ra->genReg(insn.src(1)); // BTI operand: either an immediate or a register
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, elemNum); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(insn.extra.elem)); // flag temp is stored right after the extra.elem data dsts, not at 2*extra.elem
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, elemNum); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated load: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
+  }
+  unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned desc) { // opens the per-BTI loop; returns the index of its first instruction
+      const GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+      setFlag(flagReg, GenRegister::immuw(0)); // start from a cleared flag
+      p->CMP(GEN_CONDITIONAL_NZ, flagReg, GenRegister::immuw(1)); // NOTE(review): presumably leaves the not-yet-serviced lanes set in the flag -- confirm
+
+      GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil)); // the btiUtil scratch register, viewed as UD/UW/UB below
+      GenRegister btiUW = ra->genReg(GenRegister::uw1grf(ir::ocl::btiUtil));
+      GenRegister btiUB = ra->genReg(GenRegister::ub1grf(ir::ocl::btiUtil));
+      unsigned jip0 = p->n_instruction(); // loop head: afterMessage patches its WHILE to jump back here
+      p->push();
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        p->AND(btiUD, flagReg, GenRegister::immud(0xffffffff)); // copy the flag bits into the scratch register
+        p->LZD(btiUD, btiUD); // leading-zero count of the lane mask
+        p->ADD(btiUW, GenRegister::negate(btiUW), GenRegister::immuw(0x1f)); // 31 - lzd
+        p->MUL(btiUW, btiUW, GenRegister::immuw(0x4)); // times 4
+        p->ADD(GenRegister::addr1(0), btiUW, GenRegister::immud(bti.nr*32)); // add bti.nr*32 to form the indirect address of the selected lane's bti
+        p->MOV(btiUD, GenRegister::indirect(GEN_TYPE_UD, 0, GEN_WIDTH_1)); // load the selected bti value through the address register
+        // save the flag into tmp; afterMessage restores it
+        p->MOV(tmp, flagReg);
+      p->pop();
+
+      p->CMP(GEN_CONDITIONAL_Z, bti, btiUD); // flag the lanes whose bti equals the selected one
+      p->push();
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        p->OR(GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), btiUB, GenRegister::immud(desc)); // addr1(0) = desc | selected bti, used by the send as its descriptor
+      p->pop();
+      return jip0;
+  }
+  void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned jip0) { // closes the loop opened by beforeMessage; jip0 is the loop-head index it returned
+    const GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+      // restore the flag that beforeMessage saved into tmp
+      setFlag(GenRegister::flag(insn.state.flag, insn.state.subFlag), tmp);
+      // keep only the channels that still have to be serviced
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->CMP(GEN_CONDITIONAL_NZ, bti, btiUD); // lanes whose bti differs from the one just handled
+        unsigned jip1 = p->n_instruction(); // index of the WHILE emitted on the next line
+        p->WHILE(GenRegister::immud(0)); // jump distance is a placeholder, patched below
+      p->pop();
+      p->patchJMPI(jip1, jip0 - jip1, 0); // backward distance from the WHILE to the loop head
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
+    const GenRegister bti = ra->genReg(insn.src(1)); // BTI operand: either an immediate or a register
+
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_READ(dst, src, bti, elemNum);
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, elemNum); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(elemNum)); // flag temporary, stored after the elemNum data dsts
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, elemNum); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated load: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
   }
 
   void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.dst(0));
     const uint32_t elemNum = insn.extra.elem;
-    const uint32_t bti = insn.getbti();
-    p->UNTYPED_WRITE(src, bti, elemNum*2);
+    const GenRegister bti = ra->genReg(insn.src(elemNum+1)); // BTI operand follows the address and the elemNum values
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(src, bti, elemNum*2); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0)); // NOTE(review): same operand as 'src' above (insn.dst(0)); src probably wants insn.src(0) -- confirm
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum*2); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated store: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_WRITE(src, bti, elemNum);
+    const GenRegister bti = ra->genReg(insn.src(elemNum+1)); // BTI operand follows the address and the elemNum values
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(src, bti, elemNum); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0)); // flag temporary, the only dst of this instruction
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated store: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
   }
 
   void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
+    const GenRegister bti = ra->genReg(insn.src(1)); // BTI operand: either an immediate or a register
     const uint32_t elemSize = insn.extra.elem;
-    p->BYTE_GATHER(dst, src, bti, elemSize);
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->BYTE_GATHER(dst, src, bti, elemSize); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(1)); // flag temporary, stored after the data dst
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateByteGatherMessageDesc(&nativeInsn, 0, elemSize); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated load: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->BYTE_GATHER(dst, src, GenRegister::addr1(0), elemSize);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
     const uint32_t elemSize = insn.extra.elem;
-    p->BYTE_SCATTER(src, bti, elemSize);
+    const GenRegister bti = ra->genReg(insn.src(2)); // BTI operand follows the address and the value
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->BYTE_SCATTER(src, bti, elemSize); // BTI is an immediate: a single send is enough
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0)); // flag temporary, the only dst of this instruction
+      GenNativeInstruction nativeInsn;
+      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
+      unsigned desc = p->generateByteScatterMessageDesc(&nativeInsn, 0, elemSize); // descriptor built with bti 0; beforeMessage ORs the real bti in
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // opens the per-BTI loop, returns its first instruction index
+
+      // predicated store: only lanes flagged by beforeMessage take part
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0); // closes the loop with a WHILE back to jip0
+    }
+
   }
 
   void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
@@ -1988,6 +2159,7 @@ namespace gbe
     allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
     allocCurbeReg(zero, GBE_CURBE_ZERO);
     allocCurbeReg(one, GBE_CURBE_ONE);
+    allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
     if (stackUse.size() != 0)
       allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
     // Go over the arguments and find the related patch locations
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 560248a..a85657c 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -169,6 +169,8 @@ namespace gbe
     virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+    unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned desc);
+    void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned jip0);
 
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 5aa8c5c..7f2d464 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -329,10 +329,7 @@ namespace gbe
     GEN_UNTYPED_ALPHA,
     0
   };
-
-  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+  unsigned GenEncoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -340,49 +337,75 @@ namespace gbe
       response_length = elemNum;
     } else if (this->curr.execWidth == 16) {
       msg_length = 2;
-      response_length = 2*elemNum;
+      response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN7_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+    assert(elemNum >= 1 && elemNum <= 4); // was '||', which holds for every value and so checked nothing
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // target SFID goes into the header for both forms of the send
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum); // immediate bti: encode the descriptor into the instruction
+    } else {
+      this->setSrc1(insn, bti); // register bti: the register is passed as src1 and no descriptor is encoded here
+    }
+  }
+
+  unsigned GenEncoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
-      msg_length = 1+elemNum;
+      msg_length = 1 + elemNum; // SIMD8: one register more than the number of elements
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
-      msg_length = 2*(1+elemNum);
+      msg_length = 2 * (1 + elemNum); // SIMD16: twice the SIMD8 length
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN7_UNTYPED_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud; // descriptor dword of insn (presumably filled in by setDPUntypedRW), so callers can reuse it
   }
 
-  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
+  void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 && elemNum <= 4); // was '||', which holds for every value and so checked nothing
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); // a write returns nothing: null destination
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // target SFID goes into the header for both forms of the send
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum); // immediate bti: encode the descriptor into the instruction
+    } else {
+      this->setSrc1(insn, bti); // register bti: the register is passed as src1 and no descriptor is encoded here
+    }
+  }
+  unsigned GenEncoder::generateByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -393,11 +416,6 @@ namespace gbe
       response_length = 2;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPByteScatterGather(this,
                            insn,
                            bti,
@@ -405,23 +423,35 @@ namespace gbe
                            GEN7_BYTE_GATHER,
                            msg_length,
                            response_length);
+    return insn->bits3.ud;
+
   }
 
-  void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
+  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // target SFID goes into the header for both forms of the send
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateByteGatherMessageDesc(insn, bti.value.ud, elemSize); // immediate bti: encode the descriptor into the instruction
+    } else {
+      this->setSrc1(insn, bti); // register bti: the register is passed as src1 and no descriptor is encoded here
+    }
+  }
+  unsigned GenEncoder::generateByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 2;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 4;
     } else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
+
     setDPByteScatterGather(this,
                            insn,
                            bti,
@@ -429,6 +459,30 @@ namespace gbe
                            GEN7_BYTE_SCATTER,
                            msg_length,
                            response_length);
+    return insn->bits3.ud;
+  }
+
+  void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // target SFID goes into the header for both forms of the send
+
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); // a scatter returns nothing: null destination
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateByteScatterMessageDesc(insn, bti.value.ud, elemSize); // immediate bti: encode the descriptor into the instruction
+    } else {
+      this->setSrc1(insn, bti); // register bti: the register is passed as src1 and no descriptor is encoded here
+    }
   }
 
   void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
@@ -460,9 +514,7 @@ namespace gbe
                            response_length);
 
   }
-
-  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned GenEncoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
 
@@ -470,16 +522,11 @@ namespace gbe
       msg_length = srcNum;
       response_length = 1;
     } else if (this->curr.execWidth == 16) {
-      msg_length = 2*srcNum;
+      msg_length = 2 * srcNum;
       response_length = 2;
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
@@ -493,7 +540,23 @@ namespace gbe
       insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return insn->bits3.ud;
+  }
 
+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // target SFID goes into the header for both forms of the send
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum); // immediate bti: encode the descriptor into the instruction
+    } else {
+      this->setSrc1(insn, bti); // register bti: the register is passed as src1 and no descriptor is encoded here
+    }
   }
   GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
     GenCompactInstruction insn;
@@ -893,6 +956,8 @@ namespace gbe
   ALU2_BRA(BRD)
   ALU2_BRA(BRC)
 
+  // jip is the distance between the jump instruction and the jump target. The
+  // pre/post-increment is already handled inside the patchJMPI() function body.
   void GenEncoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
     GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
     GBE_ASSERT(insnID < this->store.size());
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 21faabc..f016e91 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -169,15 +169,15 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Atomic instructions */
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
     /*! Untyped read (upto 4 channels) */
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
-    void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
+    void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
-    void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
     /*! DWord gather (for constant cache read) */
     void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
     /*! for scratch memory read */
@@ -230,6 +230,12 @@ namespace gbe
     void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
                               unsigned msg_length, unsigned response_length,
                               bool header_present = false, bool end_of_thread = false);
+    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    unsigned generateByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+    unsigned generateByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+
     virtual void setHeader(GenNativeInstruction *insn) = 0;
     virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
     virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 19a3c24..367dcdb 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -595,19 +595,19 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Atomic instruction */
-    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp);
     /*! Read 64 bits float/int array */
-    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, uint32_t bti, bool native_long);
+    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, GenRegister *flagTemp);
     /*! Write 64 bits float/int array */
-    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, uint32_t bti, bool native_long);
+    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, GenRegister *flagTemp);
     /*! Untyped read (up to 4 elements) */
-    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
     /*! Untyped write (up to 4 elements) */
-    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
-    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
+    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
-    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
     /*! Unpack the uint to charN */
@@ -1197,16 +1197,26 @@ namespace gbe
 
   void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
                                      uint32_t srcNum, Reg src0,
-                                     Reg src1, Reg src2, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+                                     Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 1 : 2;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
+
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     insn->dst(0) = dst;
+    if(flagTemp) insn->dst(1) = *flagTemp;
+
     insn->src(0) = src0;
     if(srcNum > 1) insn->src(1) = src1;
     if(srcNum > 2) insn->src(2) = src2;
+    insn->src(srcNum) = bti;
     insn->extra.function = function;
-    insn->setbti(bti);
-    SelectionVector *vector = this->appendVector();
+    insn->extra.elem = srcNum;
 
+    SelectionVector *vector = this->appendVector();
     vector->regNum = srcNum;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
@@ -1220,22 +1230,29 @@ namespace gbe
                                  const GenRegister *dst,
                                  const GenRegister *tmp,
                                  uint32_t elemNum,
-                                 uint32_t bti,
-                                 bool native_long)
+                                 const GenRegister bti,
+                                 bool native_long,
+                                 GenRegister *flagTemp)
   {
     SelectionInstruction *insn = NULL;
     SelectionVector *srcVector = NULL;
     SelectionVector *dstVector = NULL;
 
     if (!native_long) {
-      insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
+      unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
       srcVector = this->appendVector();
       dstVector = this->appendVector();
       // Regular instruction to encode
       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
         insn->dst(elemID) = dst[elemID];
+
+      // flagTemp doesn't need to be put in a SelectionVector
+      if (flagTemp)
+        insn->dst(elemNum) = *flagTemp;
     } else {
-      insn = this->appendInsn(SEL_OP_READ64, elemNum*2, 1);
+      unsigned dstNum = flagTemp == NULL ? elemNum*2 : elemNum*2+1;
+      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
       srcVector = this->appendVector();
       dstVector = this->appendVector();
 
@@ -1244,10 +1261,20 @@ namespace gbe
 
       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
         insn->dst(elemID + elemNum) = dst[elemID];
+
+      // flagTemp doesn't need to be put in a SelectionVector
+      if (flagTemp)
+        insn->dst(2*elemNum) = *flagTemp;
+    }
+
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
     }
 
     insn->src(0) = addr;
-    insn->setbti(bti);
+    insn->src(1) = bti;
+
     insn->extra.elem = elemNum;
 
     dstVector->regNum = elemNum;
@@ -1262,9 +1289,11 @@ namespace gbe
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
-                                       uint32_t bti)
+                                       GenRegister bti,
+                                       GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
+    unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
     if (this->isScalarReg(dst[0].reg()))
@@ -1272,8 +1301,16 @@ namespace gbe
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
+    if (flagTemp)
+      insn->dst(elemNum) = *flagTemp;
+
     insn->src(0) = addr;
-    insn->setbti(bti);
+    insn->src(1) = bti;
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation
@@ -1290,31 +1327,40 @@ namespace gbe
                                   const GenRegister *src,
                                   const GenRegister *tmp,
                                   uint32_t srcNum,
-                                  uint32_t bti,
-                                  bool native_long)
+                                  GenRegister bti,
+                                  bool native_long,
+                                  GenRegister *flagTemp)
   {
     SelectionVector *vector = NULL;
     SelectionInstruction *insn = NULL;
 
     if (!native_long) {
-      insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
+      unsigned dstNum = flagTemp == NULL ? 0 : 1;
+      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
       vector = this->appendVector();
-      // Regular instruction to encode
+      // Register layout:
+      // dst: (flagTemp)
+      // src: addr, srcNum, bti
       insn->src(0) = addr;
       for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
         insn->src(elemID + 1) = src[elemID];
 
-      insn->setbti(bti);
+      insn->src(srcNum+1) = bti;
+      if (flagTemp)
+        insn->dst(0) = *flagTemp;
       insn->extra.elem = srcNum;
 
       vector->regNum = srcNum + 1;
       vector->reg = &insn->src(0);
       vector->isSrc = 1;
     } else { // handle the native long case
-      insn = this->appendInsn(SEL_OP_WRITE64, srcNum, srcNum*2 + 1);
+      unsigned dstNum = flagTemp == NULL ? srcNum : srcNum+1;
+      // Register layout:
+      // dst: srcNum, (flagTemp)
+      // src: srcNum, addr, srcNum, bti.
+      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
       vector = this->appendVector();
 
-      insn->src(0) = addr;
       for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
         insn->src(elemID) = src[elemID];
 
@@ -1322,33 +1368,50 @@ namespace gbe
       for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
         insn->src(srcNum + 1 + elemID) = tmp[0];
 
+      insn->src(srcNum*2+1) = bti;
       /* We also need to add the tmp reigster to dst, in order
          to avoid the post schedule error . */
       for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
         insn->dst(elemID) = tmp[0];
 
-      insn->setbti(bti);
+      if (flagTemp)
+        insn->dst(srcNum) = *flagTemp;
       insn->extra.elem = srcNum;
 
       vector->regNum = srcNum + 1;
       vector->reg = &insn->src(srcNum);
       vector->isSrc = 1;
     }
+
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
   }
 
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
-                                        uint32_t bti)
+                                        GenRegister bti,
+                                        GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
+    unsigned dstNum = flagTemp == NULL ? 0 : 1; // the flag temporary, when given, is the only dst
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, elemNum+2+dstNum); // srcs: addr, elemNum values, bti, plus flagTemp when present (src(elemNum+2) below)
     SelectionVector *vector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0; // register bti: fix the flag to flag 0, subFlag 1, which the emit stage reads back from insn.state
+      insn->state.subFlag = 1;
+    }
+
+    if (flagTemp) insn->dst(0) = *flagTemp;
     // Regular instruction to encode
     insn->src(0) = addr;
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->src(elemID+1) = src[elemID];
-    insn->setbti(bti);
+    insn->src(elemNum+1) = bti; // bti follows the address and the values
+    if (flagTemp)
+      insn->src(elemNum+2) = *flagTemp; // flagTemp is listed as a src as well as dst(0)
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation for the sources
@@ -1357,17 +1420,26 @@ namespace gbe
     vector->isSrc = 1;
   }
 
-  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
+  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 1 : 2;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     if (this->isScalarReg(dst.reg()))
       insn->state.noMask = 1;
     // Instruction to encode
     insn->src(0) = addr;
+    insn->src(1) = bti;
     insn->dst(0) = dst;
-    insn->setbti(bti);
+    if (flagTemp)
+      insn->dst(1) = *flagTemp;
+
     insn->extra.elem = elemSize;
 
     // byte gather requires vector in the sense that scalar are not allowed
@@ -1380,14 +1452,22 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
+  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 0 : 1;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
     SelectionVector *vector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
+    if (flagTemp)
+      insn->dst(0) = *flagTemp;
     // Instruction to encode
     insn->src(0) = addr;
     insn->src(1) = src;
-    insn->setbti(bti);
+    insn->src(2) = bti;
     insn->extra.elem = elemSize;
 
     // value and address are contiguous in the send
@@ -3096,34 +3176,31 @@ namespace gbe
     }
   }
 
-  /*! Load instruction pattern */
-  DECL_PATTERN(LoadInstruction)
+  static void markAllChildrenExceptBTI(SelectionDAG &dag) {
+    // Do not merge anything, so all sources become roots
+    for (uint32_t childID = 1; childID < dag.childNum; ++childID)
+      if (dag.child[childID])
+        dag.child[childID]->isRoot = 1;
+  }
+
+  class LoadInstructionPattern : public SelectionPattern
   {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    LoadInstructionPattern(void) : SelectionPattern(1, 1) {
+       this->opcodes.push_back(ir::OP_LOAD);
+    }
     void readDWord(Selection::Opaque &sel,
                    vector<GenRegister> &dst,
-                   vector<GenRegister> &dst2,
                    GenRegister addr,
                    uint32_t valueNum,
                    ir::BTI bti) const
     {
-      for (uint32_t x = 0; x < bti.count; x++) {
-        if(x > 0)
-          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
-
-        GenRegister temp = getRelativeAddress(sel, addr, bti.bti[x]);
-        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
-        if(x > 0) {
-          sel.push();
-            if(sel.isScalarReg(dst[0].reg())) {
-              sel.curr.noMask = 1;
-              sel.curr.execWidth = 1;
-            }
-            for (uint32_t y = 0; y < valueNum; y++)
-              sel.ADD(dst[y], dst[y], dst2[y]);
-          sel.pop();
-        }
-      }
+        //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
+
+        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+        GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+        sel.UNTYPED_READ(addr, dst.data(), valueNum, b, bti.isConst ? NULL : &tmp);
     }
 
     void emitUntypedRead(Selection::Opaque &sel,
@@ -3134,10 +3211,9 @@ namespace gbe
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
-      vector<GenRegister> dst2(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
-      readDWord(sel, dst, dst2, addr, valueNum, bti);
+        dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+      readDWord(sel, dst, addr, valueNum, bti);
     }
 
     void emitDWordGather(Selection::Opaque &sel,
@@ -3146,15 +3222,15 @@ namespace gbe
                          ir::BTI bti) const
     {
       using namespace ir;
-      GBE_ASSERT(bti.count == 1);
-      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
+      GBE_ASSERT(bti.isConst == 1);
       GBE_ASSERT(insn.getValueNum() == 1);
+      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
 
       if(isUniform) {
         GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
         sel.push();
           sel.curr.noMask = 1;
-          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+          sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
         sel.pop();
         return;
       }
@@ -3170,7 +3246,7 @@ namespace gbe
         sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
       sel.pop();
 
-      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
+      sel.DWORD_GATHER(dst, addrDW, bti.imm);
     }
 
     void emitRead64(Selection::Opaque &sel,
@@ -3182,9 +3258,10 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      GBE_ASSERT(bti.count == 1);
+      GBE_ASSERT(bti.isConst == 1);
       vector<GenRegister> dst(valueNum);
-      GenRegister tmpAddr = getRelativeAddress(sel, addr, bti.bti[0]);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+      GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
 
@@ -3194,9 +3271,9 @@ namespace gbe
           tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
         }
 
-        sel.READ64(tmpAddr, dst.data(), tmp.data(), valueNum, bti.bti[0], true);
+        sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, bti.isConst ? NULL : &tmpFlag);
       } else {
-        sel.READ64(tmpAddr, dst.data(), NULL, valueNum, bti.bti[0], false);
+        sel.READ64(addr, dst.data(), NULL, valueNum, b, false, bti.isConst ? NULL : &tmpFlag);
       }
     }
 
@@ -3205,12 +3282,16 @@ namespace gbe
                         GenRegister address,
                         GenRegister dst,
                         bool isUniform,
-                        uint8_t bti) const
+                        ir::BTI bti) const
     {
       using namespace ir;
         Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
         GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
         GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
+
+        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+        GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+
         // Get dword aligned addr
         sel.push();
           if (isUniform) {
@@ -3222,7 +3303,7 @@ namespace gbe
         sel.push();
           if (isUniform)
             sel.curr.noMask = 1;
-          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, bti.isConst ? NULL : &tmpFlag);
 
           if (isUniform)
             sel.curr.execWidth = 1;
@@ -3258,14 +3339,11 @@ namespace gbe
 
       uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
       vector<GenRegister> tmp(tmpRegNum);
-      vector<GenRegister> tmp2(tmpRegNum);
-      vector<Register> tmpReg(tmpRegNum);
       for(uint32_t i = 0; i < tmpRegNum; i++) {
-        tmpReg[i] = sel.reg(FAMILY_DWORD, isUniform);
-        tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32);
+        tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
       }
 
-      readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
+      readDWord(sel, tmp, address, tmpRegNum, bti);
 
       for(uint32_t i = 0; i < tmpRegNum; i++) {
         unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3370,7 +3448,7 @@ namespace gbe
               sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
             sel.pop();
           }
-          readDWord(sel, t1, t2, alignedAddr, width, bti);
+          readDWord(sel, t1, alignedAddr, width, bti);
           remainedReg -= width;
           pos += width;
         } while(remainedReg);
@@ -3389,24 +3467,8 @@ namespace gbe
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
         GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
-        GenRegister tmp = value;
 
-        for (int x = 0; x < bti.count; x++) {
-          if (x > 0)
-            tmp = sel.selReg(sel.reg(family, isUniform), insn.getValueType());
-
-          GenRegister addr = getRelativeAddress(sel, address, bti.bti[x]);
-          readByteAsDWord(sel, elemSize, addr, tmp, isUniform, bti.bti[x]);
-          if (x > 0) {
-            sel.push();
-              if (isUniform) {
-                sel.curr.noMask = 1;
-                sel.curr.execWidth = 1;
-              }
-              sel.ADD(value, value, tmp);
-            sel.pop();
-          }
-        }
+        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
       }
     }
 
@@ -3422,30 +3484,33 @@ namespace gbe
       sel.INDIRECT_MOVE(dst, src);
     }
 
-    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
-      if (bti == 0xfe || bti == BTI_CONSTANT)
-        return address;
-
-      sel.push();
-        sel.curr.noMask = 1;
-        if (GenRegister::hstride_size(address) == 0)
-          sel.curr.execWidth = 1;
-        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, sel.curr.execWidth == 1), ir::TYPE_U32);
-        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
-      sel.pop();
-      return temp;
-    }
-    // check whether all binded table index point to constant memory
     INLINE bool isAllConstant(const ir::BTI &bti) const {
-      for (int x = 0; x < bti.count; x++) {
-         if (bti.bti[x] != BTI_CONSTANT)
-           return false;
+      if (bti.isConst && bti.imm == BTI_CONSTANT)
+        return true;
+      return false;
+    }
+
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
       }
-      return true;
+      return b;
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
+    {
       using namespace ir;
+      const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
@@ -3453,9 +3518,11 @@ namespace gbe
                  insn.getAddressSpace() == MEM_LOCAL ||
                  insn.getAddressSpace() == MEM_MIXED);
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+
+      BTI bti = getBTI(dag, insn);
+
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      const BTI &bti = insn.getBTI();
       bool allConstant = isAllConstant(bti);
 
       if (allConstant) {
@@ -3480,65 +3547,78 @@ namespace gbe
         else
           this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
       }
+
+      if (insn.isFixedBTI()) {
+        markAllChildrenExceptBTI(dag);
+      } else {
+        markAllChildren(dag);
+      }
       return true;
     }
-    DECL_CTOR(LoadInstruction, 1, 1);
   };
-
-  /*! Store instruction pattern */
-  DECL_PATTERN(StoreInstruction)
+  class StoreInstructionPattern : public SelectionPattern
   {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    StoreInstructionPattern(void) : SelectionPattern(1, 1) {
+       this->opcodes.push_back(ir::OP_STORE);
+    }
     void emitUntypedWrite(Selection::Opaque &sel,
                           const ir::StoreInstruction &insn,
-                          GenRegister addr,
-                          uint32_t bti) const
+                          GenRegister address,
+                          ir::BTI &bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> value(valueNum);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
 
-      addr = GenRegister::retype(addr, GEN_TYPE_F);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
+      GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+      sel.UNTYPED_WRITE(address, value.data(), valueNum, b, bti.isConst? NULL : &tmp);
     }
 
     void emitWrite64(Selection::Opaque &sel,
                      const ir::StoreInstruction &insn,
-                     GenRegister addr,
-                     uint32_t bti) const
+                     GenRegister address,
+                     ir::BTI &bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(addr, GEN_TYPE_UD);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
       vector<GenRegister> src(valueNum);
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
 
+      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+
       if (sel.hasLongType()) {
         vector<GenRegister> tmp(valueNum);
         for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
           tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
         }
-        sel.WRITE64(addr, src.data(), tmp.data(), valueNum, bti, true);
+        sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, bti.isConst? NULL : &tmpFlag);
       } else {
-        sel.WRITE64(addr, src.data(), NULL, valueNum, bti, false);
+        sel.WRITE64(address, src.data(), NULL, valueNum, b, false, bti.isConst? NULL : &tmpFlag);
       }
     }
 
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
-                         GenRegister addr,
-                         uint32_t bti,
+                         GenRegister address,
+                         ir::BTI &bti,
                          bool isUniform) const
     {
       using namespace ir;
       uint32_t valueNum = insn.getValueNum();
 
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
       if(valueNum > 1) {
         const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
         vector<GenRegister> value(valueNum);
@@ -3558,11 +3638,12 @@ namespace gbe
           sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
         }
 
-        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+        sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, bti.isConst ? NULL : &tmpFlag);
       } else {
         const GenRegister value = sel.selReg(insn.getValue(0));
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+
         sel.push();
           if (isUniform) {
             sel.curr.noMask = 1;
@@ -3574,47 +3655,52 @@ namespace gbe
           else if (elemSize == GEN_BYTE_SCATTER_BYTE)
             sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
         sel.pop();
-        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+        sel.BYTE_SCATTER(address, tmp, elemSize, b, bti.isConst ? NULL : &tmpFlag);
       }
     }
 
-    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti, bool isUniform) const {
-      if(bti == 0xfe)
-        return address;
 
-      sel.push();
-        sel.curr.noMask = 1;
-        if (isUniform)
-          sel.curr.execWidth = 1;
-        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, isUniform), ir::TYPE_U32);
-        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
-      sel.pop();
-      return temp;
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
+      }
+      return b;
     }
-
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
+    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
     {
       using namespace ir;
+      const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
 
       const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
+      BTI bti = getBTI(dag, insn);
 
-      BTI bti = insn.getBTI();
-      for (int x = 0; x < bti.count; x++) {
-        GenRegister temp = getRelativeAddress(sel, address, bti.bti[x], isUniform);
-        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitWrite64(sel, insn, temp, bti.bti[x]);
-        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
-        else {
-          this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x], isUniform);
-        }
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitWrite64(sel, insn, address, bti);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+        this->emitUntypedWrite(sel, insn, address,  bti);
+      else {
+        this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
+      }
+
+      if (insn.isFixedBTI()) {
+        markAllChildrenExceptBTI(dag);
+      } else {
+        markAllChildren(dag);
       }
       return true;
     }
-    DECL_CTOR(StoreInstruction, 1, 1);
   };
 
   /*! Compare instruction pattern */
@@ -4178,38 +4264,61 @@ namespace gbe
     DECL_CTOR(ConvertInstruction, 1, 1);
   };
 
-  /*! Convert instruction pattern */
-  DECL_PATTERN(AtomicInstruction)
+  /*! atomic instruction pattern */
+  class AtomicInstructionPattern : public SelectionPattern
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
-    {
+  public:
+    AtomicInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::AtomicInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
+      }
+      return b;
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
       using namespace ir;
+      const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
+
+      ir::BTI b = getBTI(dag, insn);
       const AtomicOps atomicOp = insn.getAtomicOpcode();
-      const AddressSpace space = insn.getAddressSpace();
-      const uint32_t srcNum = insn.getSrcNum();
+      unsigned srcNum = insn.getSrcNum();
+      unsigned opNum = srcNum - 1;
 
-      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
-      GenRegister src1 = src0, src2 = src0;
-      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
-      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
+      GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32);   //address
+      GenRegister src1 = src0, src2 = src0;
+      if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
+
+      GenRegister flagTemp = sel.selReg(sel.reg(FAMILY_WORD, true), TYPE_U16);
+
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
-      if(space == MEM_LOCAL) {
-        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+
+      sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, b.isConst ? NULL : &flagTemp);
+
+      if (insn.isFixedBTI()) {
+        markAllChildrenExceptBTI(dag);
       } else {
-        ir::BTI b = insn.getBTI();
-        for (int x = 0; x < b.count; x++) {
-          sel.push();
-            sel.curr.noMask = 1;
-            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
-          sel.pop();
-          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
-        }
+        markAllChildren(dag);
       }
       return true;
     }
-    DECL_CTOR(AtomicInstruction, 1, 1);
   };
 
   /*! Select instruction pattern */
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 7c9bce5..37963f9 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -100,7 +100,7 @@ namespace gbe
       struct {
         /*! Store bti for loads/stores and function for math, atomic and compares */
         uint16_t function:8;
-        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
         uint16_t elem:8;
       };
       struct {
@@ -149,14 +149,7 @@ namespace gbe
     INLINE uint32_t getbti() const {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
-        case SEL_OP_ATOMIC: return extra.elem;
-        case SEL_OP_BYTE_SCATTER:
-        case SEL_OP_WRITE64:
-        case SEL_OP_DWORD_GATHER:
-        case SEL_OP_UNTYPED_WRITE:
-        case SEL_OP_UNTYPED_READ:
-        case SEL_OP_BYTE_GATHER:
-        case SEL_OP_READ64: return extra.function;
+        case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
         case SEL_OP_TYPED_WRITE: return extra.bti;
         default:
@@ -168,14 +161,7 @@ namespace gbe
     INLINE void setbti(uint32_t bti) {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
-        case SEL_OP_ATOMIC: extra.elem = bti; return;
-        case SEL_OP_BYTE_SCATTER:
-        case SEL_OP_WRITE64:
-        case SEL_OP_UNTYPED_WRITE:
-        case SEL_OP_DWORD_GATHER:
-        case SEL_OP_UNTYPED_READ:
-        case SEL_OP_BYTE_GATHER:
-        case SEL_OP_READ64: extra.function = bti; return;
+        case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
         case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
         default:
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 581f823..ed1f572 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -838,6 +838,8 @@ namespace gbe
       reg.absolute = 0;
       reg.vstride = 0;
       reg.hstride = 0;
+      reg.a0_subnr = 0;
+      reg.addr_imm = 0;
       return reg;
     }
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 8c171f5..3637ebb 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -103,6 +103,7 @@ enum gbe_curbe_type {
   GBE_CURBE_ONE,
   GBE_CURBE_LANE_ID,
   GBE_CURBE_SLM_OFFSET,
+  GBE_CURBE_BTI_UTIL,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index af65ff3..54265d0 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -190,22 +190,22 @@ namespace ir {
 
     /*! LOAD with the destinations directly specified */
     template <typename... Args>
-    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
+      this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
     }
 
     /*! STORE with the sources directly specified */
     template <typename... Args>
-    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
+      this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
     }
     void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
 
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 7723b90..fe8807e 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -318,14 +318,14 @@ namespace ir {
 
     class ALIGNED_INSTRUCTION AtomicInstruction :
       public BasePolicy,
-      public TupleSrcPolicy<AtomicInstruction>,
       public NDstPolicy<AtomicInstruction, 1>
     {
     public:
       AtomicInstruction(AtomicOps atomicOp,
                          Register dst,
                          AddressSpace addrSpace,
-                         BTI bti,
+                         Register bti,
+                         bool fixedBTI,
                          Tuple src)
       {
         this->opcode = OP_ATOMIC;
@@ -334,23 +334,43 @@ namespace ir {
         this->src = src;
         this->addrSpace = addrSpace;
         this->bti = bti;
+        this->fixedBTI = fixedBTI ? 1: 0;
         srcNum = 2;
         if((atomicOp == ATOMIC_OP_INC) ||
           (atomicOp == ATOMIC_OP_DEC))
           srcNum = 1;
         if(atomicOp == ATOMIC_OP_CMPXCHG)
           srcNum = 3;
+        srcNum++;
       }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0u)
+          return bti;
+        else
+          return fn.getRegister(src, ID -1);
+      }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0u)
+          bti = reg;
+        else
+          fn.setRegister(src, ID - 1, reg);
+      }
+      INLINE uint32_t getSrcNum(void) const { return srcNum; }
+
       INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register dst[1];
       Tuple src;
       AddressSpace addrSpace; //!< Address space
-      BTI bti;               //!< bti
-      uint8_t srcNum:2;     //!<Source Number
+      Register bti;               //!< bti
+      uint8_t fixedBTI:1;      //!< fixed bti or not
+      uint8_t srcNum:3;     //!<Source Number
       AtomicOps atomicOp:6;     //!<Source Number
     };
 
@@ -410,7 +430,7 @@ namespace ir {
 
     class ALIGNED_INSTRUCTION LoadInstruction :
       public BasePolicy,
-      public NSrcPolicy<LoadInstruction, 1>
+      public NSrcPolicy<LoadInstruction, 2>
     {
     public:
       LoadInstruction(Type type,
@@ -419,7 +439,8 @@ namespace ir {
                       AddressSpace addrSpace,
                       uint32_t valueNum,
                       bool dwAligned,
-                      BTI bti)
+                      bool fixedBTI,
+                      Register bti)
       {
         GBE_ASSERT(valueNum < 128);
         this->opcode = OP_LOAD;
@@ -429,6 +450,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->fixedBTI = fixedBTI ? 1 : 0;
         this->bti = bti;
       }
       INLINE Register getDst(const Function &fn, uint32_t ID) const {
@@ -443,16 +465,18 @@ namespace ir {
       INLINE Type getValueType(void) const { return type; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       Type type;              //!< Type to store
       Register src[0];        //!< Address where to load from
+      Register bti;
       Register offset;        //!< Alias to make it similar to store
       Tuple values;           //!< Values to load
       AddressSpace addrSpace; //!< Where to load
-      BTI bti;
+      uint8_t fixedBTI:1;
       uint8_t valueNum:7;     //!< Number of values to load
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
     };
@@ -467,7 +491,8 @@ namespace ir {
                        AddressSpace addrSpace,
                        uint32_t valueNum,
                        bool dwAligned,
-                       BTI bti)
+                       bool fixedBTI,
+                       Register bti)
       {
         GBE_ASSERT(valueNum < 255);
         this->opcode = OP_STORE;
@@ -477,35 +502,42 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->fixedBTI = fixedBTI ? 1 : 0;
         this->bti = bti;
       }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
-        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
         if (ID == 0u)
+          return bti;
+        else if (ID == 1u)
           return offset;
         else
-          return fn.getRegister(values, ID - 1);
+          return fn.getRegister(values, ID - 2);
       }
       INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
-        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
         if (ID == 0u)
+          bti = reg;
+        else if (ID == 1u)
           offset = reg;
         else
-          fn.setRegister(values, ID - 1, reg);
+          fn.setRegister(values, ID - 2, reg);
       }
-      INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
+      INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE Type getValueType(void) const { return type; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       Type type;              //!< Type to store
+      Register bti;
       Register offset;        //!< First source is the offset where to store
       Tuple values;           //!< Values to store
       AddressSpace addrSpace; //!< Where to store
-      BTI bti;                //!< Which btis need access
+      uint8_t fixedBTI:1;                //!< fixed bti or not
       uint8_t valueNum:7;     //!< Number of values to store
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
       Register dst[0];        //!< No destination
@@ -961,10 +993,12 @@ namespace ir {
         return false;
       if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
         return false;
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
-        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+      for (uint32_t srcID = 0; srcID < srcNum-1; ++srcID)
+        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
           return false;
 
+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
+        return false;
       return true;
     }
 
@@ -1165,12 +1199,10 @@ namespace ir {
       this->outOpcode(out);
       out << "." << addrSpace;
       out << " %" << this->getDst(fn, 0);
-      out << " {" << "%" << this->getSrc(fn, 0) << "}";
-      for (uint32_t i = 1; i < srcNum; ++i)
+      out << " {" << "%" << this->getSrc(fn, 1) << "}";
+      for (uint32_t i = 2; i < srcNum; ++i)
         out << " %" << this->getSrc(fn, i);
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
 
@@ -1204,22 +1236,18 @@ namespace ir {
       for (uint32_t i = 0; i < valueNum; ++i)
         out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out << " %" << this->getSrc(fn, 0);
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out << " %" << this->getSrc(fn, 1);
+      out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
-      out << " %" << this->getSrc(fn, 0) << " {";
+      out << " %" << this->getSrc(fn, 1) << " {";
       for (uint32_t i = 0; i < valueNum; ++i)
-        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
+        out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
     INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1560,18 +1588,18 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
+DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -1735,8 +1763,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   }
 
   // For all unary functions with given opcode
-  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
-    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
   }
 
   // BRA
@@ -1784,9 +1812,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
                    AddressSpace space, \
                    uint32_t valueNum, \
                    bool dwAligned, \
-                   BTI bti) \
+                   bool fixedBTI, \
+                   Register bti) \
   { \
-    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
+    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
   }
 
   DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 436bfd2..23a7d00 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -36,10 +36,13 @@
 namespace gbe {
 namespace ir {
   struct BTI {
-    uint8_t bti[MAX_MIXED_POINTER];
-    uint8_t count;
-    BTI() : count(0) {
-      memset(bti, 0, MAX_MIXED_POINTER);
+    uint8_t isConst; // non-zero when the bti is fixed
+    union {
+      Register reg;  // register holding the bti (mixed case)
+      unsigned short imm;  // the bti value itself (fixed case)
+    };
+
+    BTI() : isConst(0) { // only isConst is set here; the union members are left untouched
     }
     ~BTI() {}
   };
@@ -289,10 +292,12 @@ namespace ir {
   class AtomicInstruction : public Instruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t addressIndex = 0;
+    static const uint32_t btiIndex = 0;
+    static const uint32_t addressIndex = 1;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
-    BTI getBTI(void) const;
+    Register getBTI(void) const { return this->getSrc(btiIndex); }
+    bool isFixedBTI(void) const;
     /*! Return the atomic function code */
     AtomicOps getAtomicOpcode(void) const;
     /*! Return the register that contains the addresses */
@@ -307,12 +312,14 @@ namespace ir {
   class StoreInstruction : public Instruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t addressIndex = 0;
+    static const uint32_t btiIndex = 0;
+    static const uint32_t addressIndex = 1;
     /*! Return the types of the values to store */
     Type getValueType(void) const;
     /*! Give the number of values the instruction is storing (srcNum-1) */
     uint32_t getValueNum(void) const;
-    BTI getBTI(void) const;
+    Register getBTI(void) const { return this->getSrc(btiIndex); }
+    bool isFixedBTI(void) const;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
     /*! DWORD aligned means untyped read for Gen. That is what matters */
@@ -322,7 +329,7 @@ namespace ir {
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       GBE_ASSERT(valueID < this->getValueNum());
-      return this->getSrc(valueID + 1u);
+      return this->getSrc(valueID + 2u);
     }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
@@ -343,8 +350,9 @@ namespace ir {
     /*! DWORD aligned means untyped read for Gen. That is what matters */
     bool isAligned(void) const;
     /*! Return the register that contains the addresses */
-    INLINE Register getAddress(void) const { return this->getSrc(0u); }
-    BTI getBTI(void) const;
+    INLINE Register getAddress(void) const { return this->getSrc(1u); }
+    Register getBTI(void) const {return this->getSrc(0u);}
+    bool isFixedBTI(void) const;
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       return this->getDst(valueID);
@@ -697,7 +705,7 @@ namespace ir {
   /*! F32TO16.{dstType <- srcType} dst src */
   Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
-  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
@@ -713,9 +721,9 @@ namespace ir {
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
   /*! store.type.space offset {src1,...,src_valueNum} value */
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 2f6539a..af9f698 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -45,7 +45,8 @@ namespace ir {
         "printf_buffer_pointer", "printf_index_buffer_pointer",
         "dwblockip",
         "lane_id",
-        "invalid"
+        "invalid",
+        "bti_utility"
     };
 
 #if GBE_DEBUG
@@ -91,6 +92,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
       DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
+      DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4de6fe0..9323824 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -74,7 +74,8 @@ namespace ir {
     static const Register dwblockip = Register(30);  // blockip
     static const Register laneid = Register(31);  // lane id.
     static const Register invalid = Register(32);  // used for valid comparation.
-    static const uint32_t regNum = 33;             // number of special registers
+    static const Register btiUtil = Register(33);  // used for mixed pointer as bti utility.
+    static const uint32_t regNum = 34;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 6bde7bf..0c29e03 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -87,6 +87,7 @@
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#include "llvm/IR/IRBuilder.h"
 #if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
@@ -290,11 +291,8 @@ namespace gbe
     return ir::MEM_GLOBAL;
   }
 
-  static INLINE ir::AddressSpace btiToGen(const ir::BTI &bti) {
-    if (bti.count > 1)
-      return ir::MEM_MIXED;
-    uint8_t singleBti = bti.bti[0];
-    switch (singleBti) {
+  static INLINE ir::AddressSpace btiToGen(const unsigned bti) {
+    switch (bti) {
       case BTI_CONSTANT: return ir::MEM_CONSTANT;
       case BTI_PRIVATE: return  ir::MEM_PRIVATE;
       case BTI_LOCAL: return ir::MEM_LOCAL;
@@ -485,7 +483,14 @@ namespace gbe
 
     map<Value *, SmallVector<Value *, 4>> pointerOrigMap;
     typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
-
+    // map pointer source to bti
+    map<Value *, unsigned> BtiMap;
+    // map ptr to its bti register
+    map<Value *, Value *> BtiValueMap;
+    // map ptr to its base
+    map<Value *, Value *> pointerBaseMap;
+
+    typedef map<Value *, Value *>::iterator PtrBaseMapIter;
     /*! We visit each function twice. Once to allocate the registers and once to
      *  emit the Gen IR instructions
      */
@@ -501,6 +506,7 @@ namespace gbe
     } ConstTypeId;
 
     LoopInfo *LI;
+    Function *Func;
     const Module *TheModule;
     int btiBase;
   public:
@@ -547,22 +553,33 @@ namespace gbe
       bool bKernel = isKernelFunction(F);
       if(!bKernel) return false;
 
+      Func = &F;
+      assignBti(F);
       analyzePointerOrigin(F);
+
       LI = &getAnalysis<LoopInfo>();
       emitFunction(F);
       phiMap.clear();
       globalPointer.clear();
       pointerOrigMap.clear();
+      BtiMap.clear();
+      BtiValueMap.clear();
+      pointerBaseMap.clear();
       // Reset for next function
       btiBase = BTI_RESERVED_NUM;
       return false;
     }
     /*! Given a possible pointer value, find out the interested escape like
         load/store or atomic instruction */
-    void findPointerEscape(Value *ptr);
+    void findPointerEscape(Value *ptr, std::set<Value *> &mixedPtr, bool recordMixed);
     /*! For all possible pointers, GlobalVariable, function pointer argument,
         alloca instruction, find their pointer escape points */
     void analyzePointerOrigin(Function &F);
+    unsigned getNewBti(Value *origin);
+    void assignBti(Function &F);
+    bool isSingleBti(Value *Val);
+    Value *getBtiRegister(Value *v);
+    Value *getPointerBase(Value *ptr);
 
     virtual bool doFinalization(Module &M) { return false; }
     /*! handle global variable register allocation (local, constant space) */
@@ -660,10 +677,10 @@ namespace gbe
     // batch vec4/8/16 load/store
     INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                   Value *llvmValue, const ir::Register ptr,
-                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
-                  bool dwAligned);
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::Register bti,
+                  bool dwAligned, bool fixedBTI);
     // handle load of dword/qword with unaligned address
-    void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
+    void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
     private:
       ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -675,7 +692,44 @@ namespace gbe
 
   char GenWriter::ID = 0;
 
-  void GenWriter::findPointerEscape(Value *ptr) {
+  static void updatePointerSource(Value *parent, Value *theUser, Value *source, SmallVector<Value *, 4> &pointers) { // store source into the slot(s) of pointers through which parent feeds theUser
+    if (isa<SelectInst>(theUser)) {
+      SelectInst *si = dyn_cast<SelectInst>(theUser);
+      if (si->getTrueValue() == parent)
+        pointers[0] = source; // slot 0 tracks the true operand of the select
+      else
+        pointers[1] = source; // slot 1 is used whenever parent is not the true operand
+    } else if (isa<PHINode>(theUser)) {
+      PHINode *phi = dyn_cast<PHINode>(theUser);
+      unsigned opNum = phi->getNumIncomingValues();
+      for (unsigned j = 0; j < opNum; j++) { // slot j tracks incoming value j of the phi
+        if (phi->getIncomingValue(j) == parent) {
+          pointers[j] = source; // every incoming edge fed by parent is updated
+        }
+      }
+    } else {
+      pointers[0] = source; // any other user keeps its source in slot 0; no bounds check is done on pointers in this function
+    }
+  }
+
+  bool isMixedPoint(Value *val, SmallVector<Value *, 4> &pointers) { // true when a non-NULL entry of pointers differs from the first recorded source
+    Value *validSrc = NULL; // first non-NULL entry that is not val itself
+    unsigned i = 0;
+    if (pointers.size() < 2) return false; // fewer than two slots is never reported as mixed
+    while(i < pointers.size()) {
+      if (pointers[i] != NULL && validSrc != NULL && pointers[i] != validSrc)
+        return true; // NOTE(review): an entry equal to val that follows validSrc also lands here, so the result depends on slot order - confirm intended
+      // an entry equal to val itself is never picked as the reference source;
+      // this often occurs for PHINode
+      if (pointers[i] != NULL && validSrc == NULL && pointers[i] != val) {
+        validSrc = pointers[i];
+      }
+      i++;
+    }
+    return false;
+  }
+
+  void GenWriter::findPointerEscape(Value *ptr,  std::set<Value *> &mixedPtr, bool bFirstPass) {
     std::vector<Value*> workList;
     std::set<Value *> visited;
 
@@ -695,7 +749,52 @@ namespace gbe
   #else
         User *theUser = iter->getUser();
   #endif
-        if (visited.find(theUser) != visited.end()) continue;
+        bool visitedInThisSource = visited.find(theUser) != visited.end();
+
+        if (isa<SelectInst>(theUser) || isa<PHINode>(theUser))
+        {
+          // reached from another source, update pointer source
+          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+          if (ptrIter == pointerOrigMap.end()) {
+            // create new one
+            unsigned capacity = 1;
+            if (isa<SelectInst>(theUser)) capacity = 2;
+            if (isa<PHINode>(theUser)) {
+              PHINode *phi = dyn_cast<PHINode>(theUser);
+              capacity = phi->getNumIncomingValues();
+            }
+
+            SmallVector<Value *, 4> pointers;
+
+            unsigned k = 0;
+            while (k++ < capacity) {
+              pointers.push_back(NULL);
+            }
+
+            updatePointerSource(work, theUser, ptr, pointers);
+            pointerOrigMap.insert(std::make_pair(theUser, pointers));
+          } else {
+            // update pointer source
+            updatePointerSource(work, theUser, ptr, (*ptrIter).second);
+          }
+          ptrIter = pointerOrigMap.find(theUser);
+
+          if (isMixedPoint(theUser, (*ptrIter).second)) {
+            // In the first pass, we need to record the mixed-point instruction.
+            // In the second pass, we don't need to go any further. The reason is:
+            // we always use its 'direct mixed pointer parent' as origin; if we don't
+            // stop here, we may set a wrong pointer origin.
+            if (bFirstPass)
+              mixedPtr.insert(theUser);
+            else
+              continue;
+          }
+          // don't fall into dead loop,
+          if (visitedInThisSource || theUser == ptr) {
+            continue;
+          }
+        }
+
         // pointer address is used as the ValueOperand in store instruction, should be skipped
         if (StoreInst *load = dyn_cast<StoreInst>(theUser)) {
           if (load->getValueOperand() == work) {
@@ -710,16 +809,30 @@ namespace gbe
             Function *F = dyn_cast<CallInst>(theUser)->getCalledFunction();
             if (!F || F->getIntrinsicID() != 0) continue;
           }
+          Value *pointer = NULL;
+          if (isa<LoadInst>(theUser)) {
+            pointer = dyn_cast<LoadInst>(theUser)->getPointerOperand();
+          } else if (isa<StoreInst>(theUser)) {
+            pointer = dyn_cast<StoreInst>(theUser)->getPointerOperand();
+          } else if (isa<CallInst>(theUser)) {
+            // atomic/read(write)image
+            CallInst *ci = dyn_cast<CallInst>(theUser);
+            pointer = ci->getArgOperand(0);
+          } else {
+            theUser->dump();
+            GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
+          }
 
-          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+          // load/store/atomic instruction, we have reached the end, stop further traversing
+          PtrOrigMapIter ptrIter = pointerOrigMap.find(pointer);
           if (ptrIter == pointerOrigMap.end()) {
             // create new one
             SmallVector<Value *, 4> pointers;
             pointers.push_back(ptr);
-            pointerOrigMap.insert(std::make_pair(theUser, pointers));
+            pointerOrigMap.insert(std::make_pair(pointer, pointers));
           } else {
-            // append it
-            (*ptrIter).second.push_back(ptr);
+            // update the pointer source here,
+            (*ptrIter).second[0] = ptr;
           }
         } else {
           workList.push_back(theUser);
@@ -727,28 +840,292 @@ namespace gbe
       }
     }
   }
+  bool GenWriter::isSingleBti(Value *Val) {
+    // Returns true when Val has a single pointer source: entries of its source list that
+    // are Val itself are ignored, and all remaining entries must be the same value.
+    if (!isa<SelectInst>(Val) && !isa<PHINode>(Val)) {
+      return true; // anything other than a select/phi is reported as single
+    } else {
+      PtrOrigMapIter iter = pointerOrigMap.find(Val); // NOTE(review): dereferenced below without comparing against end()
+      SmallVector<Value *, 4> &pointers = (*iter).second;
+      unsigned srcNum = pointers.size();
+      Value *source = NULL; // first entry seen that is not Val itself
+      for (unsigned x = 0; x < srcNum; x++) {
+        // often happens for a PHINode where one source is the PHINode itself, skip it
+        if (pointers[x] == Val) continue;
+
+        if (source == NULL) source = pointers[x];
+        else {
+          if (source != pointers[x])
+            return false; // a second, different source was found
+        }
+      }
+      return true;
+    }
+  }
+  Value *GenWriter::getPointerBase(Value *ptr) { // returns the Value used as the base of ptr; results for derived pointers are cached in pointerBaseMap
+    PtrBaseMapIter baseIter = pointerBaseMap.find(ptr);
+    if (baseIter != pointerBaseMap.end()) {
+      return baseIter->second; // already computed
+    }
+    typedef std::map<Value *, unsigned>::iterator BtiIter;
+    // a pointer that already has a bti assigned is itself a base pointer,
+    BtiIter found = BtiMap.find(ptr);
+    if (found != BtiMap.end()) {
+      if (isa<PointerType>(ptr->getType())) {
+        PointerType *ty = cast<PointerType>(ptr->getType());
+        // only a global pointer (address space 1) has a starting address
+        if (ty->getAddressSpace() == 1) {
+          return ptr;
+        } else {
+          return ConstantPointerNull::get(ty); // other address spaces use a null pointer as base
+        }
+      } else {
+          PointerType *ty = PointerType::get(ptr->getType(), 0); // non-pointer value: pointer to its type in address space 0
+          return ConstantPointerNull::get(ty);
+      }
+    }
+
+    PtrOrigMapIter iter = pointerOrigMap.find(ptr); // NOTE(review): dereferenced below without comparing against end()
+    SmallVector<Value *, 4> &pointers = (*iter).second;
+    if (isSingleBti(ptr)) {
+      Value *base = getPointerBase(pointers[0]); // single source: reuse the base of slot 0 (NOTE(review): assumes slot 0 is neither NULL nor ptr itself - confirm)
+      pointerBaseMap.insert(std::make_pair(ptr, base));
+      return base;
+    } else {
+      if (isa<SelectInst>(ptr)) { // mixed select: build a parallel select over the bases of both operands
+          SelectInst *si = dyn_cast<SelectInst>(ptr);
+          IRBuilder<> Builder(si->getParent());
+
+          Value *trueVal = getPointerBase((*iter).second[0]);
+          Value *falseVal = getPointerBase((*iter).second[1]);
+          Builder.SetInsertPoint(si); // insertion point is set at si
+          Value *base = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+          pointerBaseMap.insert(std::make_pair(ptr, base));
+        return base;
+      } else if (isa<PHINode>(ptr)) { // mixed phi: build a parallel phi over the bases of the incoming values
+          PHINode *phi = dyn_cast<PHINode>(ptr);
+          IRBuilder<> Builder(phi->getParent());
+          Builder.SetInsertPoint(phi); // insertion point is set at phi
+
+          PHINode *basePhi = Builder.CreatePHI(ptr->getType(), phi->getNumIncomingValues());
+          unsigned srcNum = pointers.size();
+          for (unsigned x = 0; x < srcNum; x++) {
+            Value *base = NULL;
+            if (pointers[x] != ptr) {
+              base = getPointerBase(pointers[x]);
+            } else {
+              base = basePhi; // a source that is the phi itself maps to the new base phi
+            }
+            basePhi->addIncoming(base, phi->getIncomingBlock(x)); // slot x is paired with incoming block x of the original phi
+          }
+          pointerBaseMap.insert(std::make_pair(ptr, basePhi)); // NOTE(review): cached only after all sources were resolved recursively
+          return basePhi;
+      } else {
+        ptr->dump();
+        GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n"); // NOTE(review): message names getBtiRegister although this is getPointerBase
+        return ptr;
+      }
+    }
+  }
+
+  Value *GenWriter::getBtiRegister(Value *Val) { // returns the Value carrying the bti of Val; results are cached in BtiValueMap
+    typedef std::map<Value *, unsigned>::iterator BtiIter;
+    typedef std::map<Value *, Value *>::iterator BtiValueIter;
+    BtiIter found = BtiMap.find(Val);
+    BtiValueIter valueIter = BtiValueMap.find(Val);
+    if (valueIter != BtiValueMap.end())
+      return valueIter->second; // already computed
+
+    if (found != BtiMap.end()) {
+      // Val already has a BTI assigned: wrap it in a 32-bit integer constant and return it
+      Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32), found->second);
+      BtiValueMap.insert(std::make_pair(Val, bti));
+      return bti;
+    } else {
+      if (isSingleBti(Val)) {
+        PtrOrigMapIter iter = pointerOrigMap.find(Val); // NOTE(review): dereferenced below without comparing against end()
+        Value * bti = getBtiRegister((*iter).second[0]); // single source: reuse the bti of slot 0 (NOTE(review): assumes slot 0 is neither NULL nor Val itself - confirm)
+        BtiValueMap.insert(std::make_pair(Val, bti));
+        return bti;
+      } else {
+        if (isa<SelectInst>(Val)) { // mixed select: build a parallel select over the btis of both operands
+          SelectInst *si = dyn_cast<SelectInst>(Val);
+
+          IRBuilder<> Builder(si->getParent());
+          PtrOrigMapIter iter = pointerOrigMap.find(Val);
+          Value *trueVal = getBtiRegister((*iter).second[0]);
+          Value *falseVal = getBtiRegister((*iter).second[1]);
+          Builder.SetInsertPoint(si); // insertion point is set at si
+          Value *bti = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+          BtiValueMap.insert(std::make_pair(Val, bti));
+          return bti;
+        } else if (isa<PHINode>(Val)) { // mixed phi: build a parallel 32-bit integer phi over the btis of the incoming values
+          PHINode *phi = dyn_cast<PHINode>(Val);
+          IRBuilder<> Builder(phi->getParent());
+          Builder.SetInsertPoint(phi); // insertion point is set at phi
+
+          PHINode *btiPhi = Builder.CreatePHI(IntegerType::get(Val->getContext(), 32), phi->getNumIncomingValues());
+          PtrOrigMapIter iter = pointerOrigMap.find(Val);
+          SmallVector<Value *, 4> &pointers = (*iter).second;
+          unsigned srcNum = pointers.size();
+          for (unsigned x = 0; x < srcNum; x++) {
+            Value *bti = NULL;
+            if (pointers[x] != Val) {
+              bti = getBtiRegister(pointers[x]);
+            } else {
+              bti = btiPhi; // a source that is the phi itself maps to the new bti phi
+            }
+            btiPhi->addIncoming(bti, phi->getIncomingBlock(x)); // slot x is paired with incoming block x of the original phi
+          }
+          BtiValueMap.insert(std::make_pair(Val, btiPhi)); // NOTE(review): cached only after all sources were resolved recursively
+          return btiPhi;
+        } else {
+          Val->dump();
+          GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n"); // only select and phi are handled as mixed values
+          return Val;
+        }
+      }
+    }
+  }
+
+  unsigned GenWriter::getNewBti(Value *origin) { // picks the bti for origin from its name, constness or pointer address space
+    unsigned new_bti = 0;
+    if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+      new_bti = btiBase; // printf buffer takes the current btiBase, then incBtiBase() is called
+      incBtiBase();
+    } else if (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+      new_bti = btiBase; // printf index buffer is handled the same way
+      incBtiBase();
+    }
+    else if (isa<GlobalVariable>(origin)
+        && dyn_cast<GlobalVariable>(origin)->isConstant()) {
+      new_bti = BTI_CONSTANT; // constant global variables share BTI_CONSTANT
+    } else {
+      unsigned space = origin->getType()->getPointerAddressSpace(); // presumably origin is always pointer-typed here - verify against callers
+      switch (space) {
+        case 0:
+          new_bti = BTI_PRIVATE;
+          break;
+        case 1:
+        {
+          new_bti = btiBase; // address space 1 takes the current btiBase, then incBtiBase() is called
+          incBtiBase();
+          break;
+        }
+        case 2:
+          new_bti = BTI_CONSTANT;
+
+          break;
+        case 3:
+          new_bti = BTI_LOCAL;
+          break;
+        default:
+          GBE_ASSERT(0); // any other address space is rejected
+          break;
+      }
+    }
+    return new_bti;
+  }
+  static bool isImageType(std::string typeName) { // true iff typeName is exactly one of the image type names listed below
+    if (typeName.compare("image1d_t") == 0        ||
+        typeName.compare("image1d_array_t") == 0  ||
+        typeName.compare("image1d_buffer_t") == 0 ||
+        typeName.compare("image2d_t") == 0        ||
+        typeName.compare("image2d_array_t") == 0  ||
+        typeName.compare("image2d_buffer_t") == 0 ||
+        typeName.compare("image3d_t") == 0)
+      return true;
+    return false; // any other name, including the empty string, is not an image type
+  }
+
+  void GenWriter::assignBti(Function &F) { // fills BtiMap for used global variables, pointer/image arguments of F and entry-block allocas
+    Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue; // globals for which isConstantUsed() is false get no bti
+
+      BtiMap.insert(std::make_pair(&v, getNewBti(&v)));
+    }
+    NamedMDNode *clKernels = TheModule->getNamedMetadata("opencl.kernels"); // NOTE(review): used below without a NULL check
+    MDNode *typeNameNode = NULL; // set to the kernel_arg_type node of F when one is found
+     uint32_t ops = clKernels->getNumOperands();
+      for(uint32_t x = 0; x < ops; x++) {
+        MDNode* node = clKernels->getOperand(x);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+        Value * op = node->getOperand(0);
+#else
+        Value * op = cast<ValueAsMetadata>(node->getOperand(0))->getValue();
+#endif
+        if(op == &F) { // operand 0 of the node identifies the kernel function
+          for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+            MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+            if (attrNode == NULL) break; // stop at the first operand that is not an MDNode
+            MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+            if (!attrName) continue;
+            if (attrName->getString() == "kernel_arg_type") {
+              typeNameNode = attrNode;
+            }
+          }
+        }
+      }
+
+    unsigned argID = 0;
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
+      std::string typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString(); // NOTE(review): typeNameNode is dereferenced even if no kernel_arg_type node was found above
+      if (I->getType()->isPointerTy() || isImageType(typeName)) { // pointer arguments and image-typed arguments get a bti
+        BtiMap.insert(std::make_pair(I, getNewBti(I)));
+      }
+    }
+
+    BasicBlock &bb = F.getEntryBlock();
+    for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
+      if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
+        BtiMap.insert(std::make_pair(ai, BTI_PRIVATE)); // allocas in the entry block always map to BTI_PRIVATE
+      }
+    }
+  }
 
   void GenWriter::analyzePointerOrigin(Function &F) {
+    // Records the values where pointers get mixed (i.e. select or phi instructions).
+    std::set<Value *> mixedPtr;
+    // This is a two-pass algorithm. The first pass updates the pointer sources of
+    // every instruction reachable from the pointers and records the mix-points.
+    // The second pass starts from the truly mixed-pointer instructions (select or phi node)
+    // and updates the sources again. A pointer reachable from a mixed pointer gets
+    // its direct mixed-pointer parent as its pointer origin.
+
     // GlobalVariable
     Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
     for(auto i = globalList.begin(); i != globalList.end(); i ++) {
       GlobalVariable &v = *i;
       if(!v.isConstantUsed()) continue;
-      findPointerEscape(&v);
+      findPointerEscape(&v, mixedPtr, true); // first pass: third argument is true
     }
     // function argument
     for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
       if (I->getType()->isPointerTy()) {
-        findPointerEscape(I);
+        findPointerEscape(I, mixedPtr, true);
       }
     }
     // alloca
     BasicBlock &bb = F.getEntryBlock();
     for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
       if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
-        findPointerEscape(ai);
+        findPointerEscape(ai, mixedPtr, true);
       }
     }
+    // the second pass starts from the mixed pointers recorded by the first pass
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      findPointerEscape(*iter, mixedPtr, false); // second pass: third argument is false
+    }
+
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      getBtiRegister(*iter); // return value is discarded here; the call is made for every mix-point
+    }
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      getPointerBase(*iter); // likewise called for every mix-point, return value discarded
+    }
   }
 
   void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
@@ -1419,7 +1796,7 @@ namespace gbe
         const ir::Register reg = getRegister(I);
         if (llvmInfo.isImageType()) {
           ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
-          ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+          ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
           collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
           continue;
         }
@@ -1452,10 +1829,7 @@ namespace gbe
             const uint32_t align = getAlignmentByte(unit, pointed);
               switch (addrSpace) {
               case ir::MEM_GLOBAL:
-                globalPointer.insert(std::make_pair(I, btiBase));
-                ctx.appendSurface(btiBase, reg);
-                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
-                incBtiBase();
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
               break;
               case ir::MEM_LOCAL:
                 ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, BTI_LOCAL);
@@ -1806,14 +2180,10 @@ namespace gbe
         ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
       } else {
         if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
-          ctx.appendSurface(btiBase, ir::ocl::printfbptr);
-          ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
-          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
         } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
-          ctx.appendSurface(btiBase, ir::ocl::printfiptr);
-          ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
-          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
         } else if(v.getName().str().substr(0, 4) == ".str") {
           /* When there are multi printf statements in multi kernel fucntions within the same
@@ -2045,6 +2415,7 @@ namespace gbe
     }
 
     ctx.startFunction(F.getName());
+
     ir::Function &fn = ctx.getFunction();
     this->regTranslator.clear();
     this->labelMap.clear();
@@ -2837,19 +3208,46 @@ namespace gbe
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
 
+    ir::AddressSpace addrSpace;
+
+    Value *llvmPtr = *AI;
+    Value *bti = getBtiRegister(llvmPtr);
+    Value *ptrBase = getPointerBase(llvmPtr);
+    ir::Register pointer = this->getRegister(llvmPtr);
+    ir::Register baseReg = this->getRegister(ptrBase);
+
+    ir::Register btiReg;
+    bool fixedBTI = false;
+    if (isa<ConstantInt>(bti)) {
+      fixedBTI = true;
+      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+      addrSpace = btiToGen(index);
+      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+      btiReg = ctx.reg(ir::FAMILY_DWORD);
+      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+    } else {
+      addrSpace = ir::MEM_MIXED;
+      btiReg = this->getRegister(bti);
+    }
+
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    const ir::Register ptr = ctx.reg(pointerFamily);
+    ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+
     const ir::Register dst = this->getRegister(&I);
 
-    ir::BTI bti;
-    gatherBTI(&I, bti);
-    const ir::AddressSpace addrSpace = btiToGen(bti);
-    vector<ir::Register> src;
     uint32_t srcNum = 0;
+    vector<ir::Register> src;
+    src.push_back(ptr);
+    srcNum++;
+    AI++;
+
     while(AI != AE) {
       src.push_back(this->getRegister(*(AI++)));
       srcNum++;
     }
     const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
+    ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
   }
 
   /* append a new sampler. should be called before any reference to
@@ -3546,8 +3944,8 @@ namespace gbe
   void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                                       Value *llvmValues, const ir::Register ptr,
                                       const ir::AddressSpace addrSpace,
-                                      Type * elemType, bool isLoad, ir::BTI bti,
-                                      bool dwAligned) {
+                                      Type * elemType, bool isLoad, ir::Register bti,
+                                      bool dwAligned, bool fixedBTI) {
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
     uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
     uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3593,79 +3991,18 @@ namespace gbe
 
       // Emit the instruction
       if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
       else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
-    }
-  }
-
-  // The idea behind is to search along the use-def chain, and find out all
-  // possible sources of the pointer. Then in later codeGen, we can emit
-  // read/store instructions to these BTIs gathered.
-  void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
-    PtrOrigMapIter iter = pointerOrigMap.find(insn);
-    if (iter != pointerOrigMap.end()) {
-      SmallVectorImpl<Value *> &origins = iter->second;
-      uint8_t nBTI = 0;
-      for (unsigned i = 0; i < origins.size(); i++) {
-        uint8_t new_bti = 0;
-        Value *origin = origins[i];
-        // all constant put into constant cache, including __constant & const __private
-        if (isa<GlobalVariable>(origin)
-            && dyn_cast<GlobalVariable>(origin)->isConstant()) {
-          new_bti = BTI_CONSTANT;
-        } else {
-          unsigned space = origin->getType()->getPointerAddressSpace();
-          switch (space) {
-            case 0:
-              new_bti = BTI_PRIVATE;
-              break;
-            case 1:
-            {
-              GlobalPtrIter iter = globalPointer.find(origin);
-              GBE_ASSERT(iter != globalPointer.end());
-              new_bti = iter->second;
-              break;
-            }
-            case 2:
-              new_bti = BTI_CONSTANT;
-              break;
-            case 3:
-              new_bti = BTI_LOCAL;
-              break;
-            default:
-              GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
-              break;
-          }
-        }
-
-        // avoid duplicate
-        bool bFound = false;
-        for (int j = 0; j < nBTI; j++) {
-          if (bti.bti[j] == new_bti) {
-            bFound = true; break;
-          }
-        }
-        if (bFound == false) {
-          bti.bti[nBTI++] = new_bti;
-          bti.count = nBTI;
-        }
-      }
-    } else {
-      insn->dump();
-      std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
-      std::cerr << "Aborting..." << std::endl;
-      exit(-1);
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
     }
-    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
   }
+
   // handle load of dword/qword with unaligned address
-  void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
+  void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
   {
     Type *llvmType = llvmValues->getType();
     const ir::Type type = getType(ctx, llvmType);
     unsigned byteSize = getTypeByteSize(unit, llvmType);
-    const ir::Register ptr = this->getRegister(llvmPtr);
 
     Type *elemType = llvmType;
     unsigned elemNum = 1;
@@ -3695,13 +4032,13 @@ namespace gbe
     const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
 
     if (isLoad) {
-      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
       ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
     } else {
       ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
       // FIXME: byte scatter does not handle correctly vector store, after fix that,
       //        we can directly use on store instruction like:
-      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
         const ir::Register reg = byteTupleData[elemID];
@@ -3716,7 +4053,7 @@ namespace gbe
           ctx.LOADI(ir::TYPE_S32, offset, immIndex);
           ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
         }
-       ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+       ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
       }
     }
   }
@@ -3729,10 +4066,31 @@ namespace gbe
     Value *llvmValues = getLoadOrStoreValue(I);
     Type *llvmType = llvmValues->getType();
     const bool dwAligned = (I.getAlignment() % 4) == 0;
-    const ir::Register ptr = this->getRegister(llvmPtr);
-    ir::BTI binding;
-    gatherBTI(&I, binding);
-    const ir::AddressSpace addrSpace = btiToGen(binding);
+    ir::AddressSpace addrSpace;
+    const ir::Register pointer = this->getRegister(llvmPtr);
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+
+    Value *bti = getBtiRegister(llvmPtr);
+    Value *ptrBase = getPointerBase(llvmPtr);
+    ir::Register baseReg = this->getRegister(ptrBase);
+    bool zeroBase = false;
+    if (isa<ConstantPointerNull>(ptrBase)) {
+      zeroBase = true;
+    }
+
+    ir::Register btiReg;
+    bool fixedBTI = false;
+    if (isa<ConstantInt>(bti)) {
+      fixedBTI = true;
+      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+      addrSpace = btiToGen(index);
+      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+      btiReg = ctx.reg(ir::FAMILY_DWORD);
+      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+    } else {
+      addrSpace = ir::MEM_MIXED;
+      btiReg = this->getRegister(bti);
+    }
 
     Type *scalarType = llvmType;
     if (!isScalarType(llvmType)) {
@@ -3740,11 +4098,20 @@ namespace gbe
       scalarType = vectorType->getElementType();
     }
 
+    ir::Register ptr = ctx.reg(pointerFamily);
+    // FIXME: skipping the subtraction of a zero base at this stage is not a good idea,
+    // but the later ArgumentLower pass needs to match the exact load/addImm pattern,
+    // so the subtraction is skipped for a zero base to satisfy the ArgumentLower pass.
+    if (!zeroBase)
+      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+    else
+      ptr = pointer;
+
     if (!dwAligned
        && (scalarType == IntegerType::get(I.getContext(), 64)
           || scalarType == IntegerType::get(I.getContext(), 32))
        ) {
-      emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
+      emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
       return;
     }
     // Scalar is easy. We neednot build register tuples
@@ -3752,9 +4119,9 @@ namespace gbe
       const ir::Type type = getType(ctx, llvmType);
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
-        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
+        ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
       else
-        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
+        ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
     }
     // A vector type requires to build a tuple
     else {
@@ -3776,10 +4143,9 @@ namespace gbe
       // The code is going to be fairly different from types to types (based on
       // size of each vector element)
       const ir::Type type = getType(ctx, elemType);
-      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       const ir::RegisterFamily dataFamily = getFamily(type);
 
-      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
@@ -3798,19 +4164,19 @@ namespace gbe
 
           // Emit the instruction
           if (isLoad)
-            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
           else
-            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
         }
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
         }
       }
       else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
               (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
@@ -3830,9 +4196,9 @@ namespace gbe
               ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
           }
           if (isLoad)
-           ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
+           ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
           else
-           ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+           ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
         }
       }
     }
-- 
2.3.6



More information about the Beignet mailing list