[Beignet] [PATCH 4/4] gbe: Implement a new BTI solution to support dynamic bti

Zhigang Gong zhigang.gong at linux.intel.com
Thu May 14 00:27:56 PDT 2015


The patch itself looks great, but it is really too large and therefore very difficult to review.
I have some minor comments below; please check them carefully :).

On Thu, Apr 30, 2015 at 11:49:49AM +0800, Ruiling Song wrote:
> while the old implementation analyze statically the pointer base, and thus
> manage compile time BTIs for all memory access instruction. The new implementation
> introduce a virtual register to hold the BTI value for the memory access instruction.
> The main benefit of this new method is it can handle storing/loading pointers.
> This is a big step towards supporting storing/loading pointers
> 
> consider following example:
> void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
>   %cmp = icmp slt i32 %add4.i, 5
>   %cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
>   store i32 %6, i32 addrspace(1)* %10, align 4, !tbaa !31
> }
> 
> will be changed to:
> 
> void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
>   %cmp = icmp slt i32 %add4.i, 5
> 
>   // new added instruction:
>   // %0 hold the value of BTIs, '3' is bti of dst1, '4' is the bti of dst2
>   // %1 holds the value of starting address for the BTIs, which will be subtracted.
> 
>   %0 = select i1 %cmp, i32 3, i32 4
>   %1 = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
> 
>   %cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
>   store i32 %cond, i32 addrspace(1)* %10, align 4
> }
> 
> The idea of the solution is: check bti register and select one lane of bti that is not accessed (through 'lzd').
> and issue the send message to the bti, and continue get the un-accessed lanes and repeat the steps.
> 
> for mixed pointer, the final asm looks like below:
> (g118 (offset 0xec0) is register holds bti of all lanes)
> ((31-lzd(active_lane_mask))*4 + bti_reg_start) is the target bti for this iteration
> 
> As the gen backend currently only allow one flag register for one selectionInstruction,
> so I have to store the flag at (54) and load at (64) at the example below.
One trick is to use a different flag register and move the store/restore instructions
out of the while loop. For example, if f0.1 is the allocated flag, then use f1.1 as well.
Please refer to the following instructions:

  mov(1) g121.14<2>:UW f1.1
  mov    f1.1<2>:UW  0x0UW
  ...
  while(16) -xx
  mov(1) f1.1  g121.14<0,1,0>

This may save some instructions if the loop iterates more than once.

> 
>     (      38)  mov(1)          f0.1<2>:UW      0x0UW                           { align1 WE_all };
>     (      40)  cmp.ne.f0.1(16) null:F          f0.1<0,1,0>:UW  0x1UW           { align1 WE_normal 1H switch };
>     (      42)  and(1)          g8.2<1>:UD      f0.1<0,1,0>:UW  0xffffffffUD    { align1 WE_all };
>     (      44)  lzd(1)          g8.2<1>:UD      g8.2<0,1,0>:UD                  { align1 WE_all };
>     (      46)  add(1)          g8.4<2>:UW      -g8.4<0,1,0>:UW 0x1fUW          { align1 WE_all };
>     (      48)  mul(1)          g8.4<2>:UW      g8.4<0,1,0>:UW  0x4UW           { align1 WE_all };
>     (      50)  add(1)          a0<2>:UW        g8.4<0,1,0>:UW  0xec0UD         { align1 WE_all };
>     (      52)  mov(1)          g8.2<1>:UD      g[a0]<0,1,0>:UD                 { align1 WE_all };
>     (      54)  mov(1)          g121.14<2>:UW   f0.1<0,1,0>:UW                  { align1 WE_all };
>     (      56)  cmp.e.f0.1(8)   null:F          g118<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 1Q switch };
>     (      58)  cmp.e.f0.1(8)   null:F          g119<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 2Q switch };

Why not use a UW type for the BTI register? Then we would only need to generate one CMP instruction in SIMD16 mode.

There are also some other comments embedded in the patch below.

>     (      60)  or(1)           a0<1>:UD        g8.8<0,1,0>:UB  0x8035e00UD     { align1 WE_all };
>     (      62)  (+f0.1) send(16) null:UW        g104<8,8,1>:UD  a0<0,1,0>:UW
>                 data                                            { align1 WE_normal 1H };
>     (      64)  mov(1)          f0.1<2>:UW      g121.14<0,1,0>:UW               { align1 WE_all };
>     (      66)  (+f0.1) cmp.ne.f0.1(8) null:F   g118<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 1Q switch };
>     (      68)  (+f0.1) cmp.ne.f0.1(8) null:F   g119<8,8,1>:UD  g8.2<0,1,0>:UD  { align1 WE_normal 2Q switch };
>     (      70)  (+f0.1) while(16) -28                                           { align1 WE_normal 1H };
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/backend/gen/gen_mesa_disasm.c  | 100 ++---
>  backend/src/backend/gen75_encoder.cpp      |  80 +++-
>  backend/src/backend/gen75_encoder.hpp      |   9 +-
>  backend/src/backend/gen8_context.cpp       |  49 ++-
>  backend/src/backend/gen8_encoder.cpp       |  79 +++-
>  backend/src/backend/gen8_encoder.hpp       |   9 +-
>  backend/src/backend/gen_context.cpp        | 204 +++++++++-
>  backend/src/backend/gen_context.hpp        |   2 +
>  backend/src/backend/gen_encoder.cpp        | 143 +++++--
>  backend/src/backend/gen_encoder.hpp        |  16 +-
>  backend/src/backend/gen_insn_selection.cpp | 465 +++++++++++++---------
>  backend/src/backend/gen_insn_selection.hpp |  20 +-
>  backend/src/backend/gen_register.hpp       |   2 +
>  backend/src/backend/program.h              |   1 +
>  backend/src/ir/context.hpp                 |   8 +-
>  backend/src/ir/instruction.cpp             | 109 ++++--
>  backend/src/ir/instruction.hpp             |  36 +-
>  backend/src/ir/profile.cpp                 |   4 +-
>  backend/src/ir/profile.hpp                 |   3 +-
>  backend/src/llvm/llvm_gen_backend.cpp      | 606 +++++++++++++++++++++++------
>  20 files changed, 1405 insertions(+), 540 deletions(-)
> 
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
> index 711b943..5f5fd3a 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -99,8 +99,8 @@ static const struct {
>    [GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
>    [GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
>  
> -  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
> -  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
> +  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
> +  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
>    [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
>    [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
>    [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
> @@ -1249,59 +1249,61 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
>                       target, &space);
>      }
>  
> -    switch (target) {
> -      case GEN_SFID_SAMPLER:
> -        format(file, " (%d, %d, %d, %d)",
> -               SAMPLE_BTI(inst),
> -               SAMPLER(inst),
> -               SAMPLER_MSG_TYPE(inst),
> -               SAMPLER_SIMD_MODE(inst));
> -        break;
> -      case GEN_SFID_DATAPORT_DATA:
> -        if(UNTYPED_RW_CATEGORY(inst) == 0) {
> +    if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
> +      switch (target) {
> +        case GEN_SFID_SAMPLER:
> +          format(file, " (%d, %d, %d, %d)",
> +                 SAMPLE_BTI(inst),
> +                 SAMPLER(inst),
> +                 SAMPLER_MSG_TYPE(inst),
> +                 SAMPLER_SIMD_MODE(inst));
> +          break;
> +        case GEN_SFID_DATAPORT_DATA:
> +          if(UNTYPED_RW_CATEGORY(inst) == 0) {
> +            format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
> +                   UNTYPED_RW_BTI(inst),
> +                   UNTYPED_RW_RGBA(inst),
> +                   data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
> +                   data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +                   data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +          } else {
> +            format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
> +                   SCRATCH_RW_OFFSET(inst),
> +                   data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
> +                   data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
> +                   data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
> +                   data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
> +          }
> +          break;
> +        case GEN_SFID_DATAPORT1_DATA:
>            format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
>                   UNTYPED_RW_BTI(inst),
>                   UNTYPED_RW_RGBA(inst),
>                   data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
>                   data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> -                 data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> -        } else {
> -          format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
> -                 SCRATCH_RW_OFFSET(inst),
> -                 data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
> -                 data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
> -                 data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
> -                 data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
> -        }
> -        break;
> -      case GEN_SFID_DATAPORT1_DATA:
> -        format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
> -               UNTYPED_RW_BTI(inst),
> -               UNTYPED_RW_RGBA(inst),
> -               data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
> -               data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> -               data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> -        break;
> -      case GEN_SFID_DATAPORT_CONSTANT:
> -        format(file, " (bti: %d, %s)",
> -               DWORD_RW_BTI(inst),
> -               data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
> -        break;
> -      case GEN_SFID_MESSAGE_GATEWAY:
> -        format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
> -               gateway_sub_function[MSG_GW_SUBFUNC(inst)],
> -               MSG_GW_NOTIFY(inst),
> -               MSG_GW_ACKREQ(inst));
> -        break;
> -
> -      default:
> -        format(file, "unsupported target %d", target);
> -        break;
> +                 data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +          break;
> +        case GEN_SFID_DATAPORT_CONSTANT:
> +          format(file, " (bti: %d, %s)",
> +                 DWORD_RW_BTI(inst),
> +                 data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
> +          break;
> +        case GEN_SFID_MESSAGE_GATEWAY:
> +          format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
> +                 gateway_sub_function[MSG_GW_SUBFUNC(inst)],
> +                 MSG_GW_NOTIFY(inst),
> +                 MSG_GW_ACKREQ(inst));
> +          break;
> +
> +        default:
> +          format(file, "unsupported target %d", target);
> +          break;
> +      }
> +      if (space)
> +        string(file, " ");
> +      format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
> +      format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
>      }
> -    if (space)
> -      string(file, " ");
> -    format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
> -    format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
>    }
>    pad(file, 64);
>    if (OPCODE(inst) != GEN_OPCODE_NOP) {

The change to the disassembly file could be split into a separate patch.

> diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
> index c77ce4d..21495af 100644
> --- a/backend/src/backend/gen75_encoder.cpp
> +++ b/backend/src/backend/gen75_encoder.cpp
> @@ -96,8 +96,7 @@ namespace gbe
>      gen7_insn->bits3.gen7_typed_rw.slot = 1;
>    }
>  
> -  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
> -    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +  unsigned Gen75Encoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
>      Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> @@ -111,11 +110,6 @@ namespace gbe
>      } else
>        NOT_IMPLEMENTED;
>  
> -    this->setHeader(insn);
> -    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
> -
>      const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
>      setMessageDescriptor(insn, sfid, msg_length, response_length);
>      gen7_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
> @@ -129,11 +123,26 @@ namespace gbe
>        gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
>      else
>        NOT_SUPPORTED;
> +    return gen7_insn->bits3.ud;
>    }
>  
> -  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
> +  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    assert(elemNum >= 1 || elemNum <= 4);
> +
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
> +
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +
> +  unsigned Gen75Encoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
>      if (this->curr.execWidth == 8) {
> @@ -144,44 +153,75 @@ namespace gbe
>        response_length = 2 * elemNum;
>      } else
>        NOT_IMPLEMENTED;
> -
> -    this->setHeader(insn);
> -    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN75_P1_UNTYPED_READ,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> -  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
> +  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
> +
> +    this->setHeader(insn);
> +    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +
> +  unsigned Gen75Encoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> -    this->setHeader(insn);
>      if (this->curr.execWidth == 8) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
>        msg_length = 1 + elemNum;
>      } else if (this->curr.execWidth == 16) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
>        msg_length = 2 * (1 + elemNum);
>      }
>      else
>        NOT_IMPLEMENTED;
> -    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN75_P1_UNTYPED_SURFACE_WRITE,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> +  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    assert(elemNum >= 1 || elemNum <= 4);
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +    if (this->curr.execWidth == 8) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
> +    } else if (this->curr.execWidth == 16) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
> +    }
> +    else
> +      NOT_IMPLEMENTED;
> +    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +
> +
>    void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
>      union { double d; unsigned u[2]; } u;
>      u.d = value;
> diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
> index 9545157..31a199c 100644
> --- a/backend/src/backend/gen75_encoder.hpp
> +++ b/backend/src/backend/gen75_encoder.hpp
> @@ -48,15 +48,18 @@ namespace gbe
>      virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
>      virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
>      virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
> -    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
> -    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
> -    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
> +    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
> +    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
> +    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
>      virtual void setHeader(GenNativeInstruction *insn);
>      virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
>                     uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
>      virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
>                                        unsigned char msg_type, uint32_t msg_length,
>                                        bool header_present);
> +    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
> +    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
> +    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
>    };
>  }
>  #endif /* __GBE_GEN75_ENCODER_HPP__ */
> diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
> index 2cdb248..e5f335e 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -800,19 +800,35 @@ namespace gbe
>        p->pop();
>      }
>    }
> -
>    void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
>    {
> -    const uint32_t bti = insn.getbti();
>      const uint32_t elemNum = insn.extra.elem;
>      GBE_ASSERT(elemNum == 1);
>  
> -    const GenRegister addr = ra->genReg(insn.src(0));
> -    const GenRegister tmp_dst = ra->genReg(insn.dst(0));
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const GenRegister src = ra->genReg(insn.src(0));
> +    const GenRegister bti = ra->genReg(insn.src(1));
>  
>      /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
>         which we can not accept. We just fallback to 2 DW untyperead here. */
> -    p->UNTYPED_READ(tmp_dst, addr, bti, elemNum*2);
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_READ(dst, src, bti, 2*elemNum);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(2*elemNum));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, 2*elemNum);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), 2*elemNum);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
>  
>      for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
>        GenRegister long_tmp = ra->genReg(insn.dst(elemID));
> @@ -823,11 +839,10 @@ namespace gbe
>  
>    void Gen8Context::emitWrite64Instruction(const SelectionInstruction &insn)
>    {
> -    const uint32_t bti = insn.getbti();
>      const uint32_t elemNum = insn.extra.elem;
>      GBE_ASSERT(elemNum == 1);
> -
>      const GenRegister addr = ra->genReg(insn.src(elemNum));
> +    const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
>  
>      /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
>         which we can not accept. We just fallback to 2 DW untypewrite here. */
> @@ -837,9 +852,25 @@ namespace gbe
>        this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
>      }
>  
> -    p->UNTYPED_WRITE(addr, bti, elemNum*2);
> -  }
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_WRITE(addr, bti, elemNum*2);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum*2);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
>  
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
> +  }
>    void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
>      const GenRegister dst = ra->genReg(insn.dst(0));
> diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
> index f02a2ca..ce6150d 100644
> --- a/backend/src/backend/gen8_encoder.cpp
> +++ b/backend/src/backend/gen8_encoder.cpp
> @@ -103,9 +103,7 @@ namespace gbe
>    void Gen8Encoder::F32TO16(GenRegister dest, GenRegister src0) {
>      MOV(GenRegister::retype(dest, GEN_TYPE_HF), GenRegister::retype(src0, GEN_TYPE_F));
>    }
> -
> -  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
> -    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +  unsigned Gen8Encoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
>      Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> @@ -119,11 +117,6 @@ namespace gbe
>      } else
>        NOT_IMPLEMENTED;
>  
> -    this->setHeader(insn);
> -    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
> -
>      const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
>      setMessageDescriptor(insn, sfid, msg_length, response_length);
>      gen8_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
> @@ -137,11 +130,26 @@ namespace gbe
>        gen8_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
>      else
>        NOT_SUPPORTED;
> +    return gen8_insn->bits3.ud;
>    }
>  
> -  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
> +  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    assert(elemNum >= 1 || elemNum <= 4);
> +
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
> +
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +  unsigned Gen8Encoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
>      if (this->curr.execWidth == 8) {
> @@ -152,44 +160,73 @@ namespace gbe
>        response_length = 2 * elemNum;
>      } else
>        NOT_IMPLEMENTED;
> -
> -    this->setHeader(insn);
> -    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN75_P1_UNTYPED_READ,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> -  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
> +  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
> +
> +    this->setHeader(insn);
> +    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +
> +  unsigned Gen8Encoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> -    this->setHeader(insn);
>      if (this->curr.execWidth == 8) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
>        msg_length = 1 + elemNum;
>      } else if (this->curr.execWidth == 16) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
>        msg_length = 2 * (1 + elemNum);
>      }
>      else
>        NOT_IMPLEMENTED;
> -    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN75_P1_UNTYPED_SURFACE_WRITE,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> +  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    assert(elemNum >= 1 || elemNum <= 4);
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +    if (this->curr.execWidth == 8) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
> +    } else if (this->curr.execWidth == 16) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
> +    }
> +    else
> +      NOT_IMPLEMENTED;
> +    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
>    void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
>      union { double d; unsigned u[2]; } u;
>      u.d = value;
> diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
> index 4c5e556..37faf25 100644
> --- a/backend/src/backend/gen8_encoder.hpp
> +++ b/backend/src/backend/gen8_encoder.hpp
> @@ -49,9 +49,9 @@ namespace gbe
>      virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
>      virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
>      virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
> -    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
> -    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
> -    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
> +    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
> +    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
> +    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
>      virtual void setHeader(GenNativeInstruction *insn);
>      virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
>                     uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
> @@ -66,6 +66,9 @@ namespace gbe
>                         GenRegister src0, GenRegister src1, GenRegister src2);
>      virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
>                              GenRegister src1 = GenRegister::null());
> +    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
> +    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
> +    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
To me, it would be a little cleaner to define two different sets of the above functions: one set that really modifies the insn,
and another that just generates the correct immediate value and does not need a dummy insn passed in.


>    };
>  }
>  #endif /* __GBE_GEN8_ENCODER_HPP__ */
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index 62fd596..c389963 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -205,7 +205,8 @@ namespace gbe
>        p->curr.execWidth = 1;
>        p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
>        p->curr.execWidth = this->simdWidth;
> -      p->ADD(stackptr, stackptr, bufferptr);
> +      // let private address start from zero
> +      p->ADD(stackptr, stackptr, GenRegister::immud(0));
The above instruction looks useless; we don't need to generate it.
And since the private address now starts from zero, we could remove the
bufferptr-related code as well.

>        p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
>      p->pop();
>    }
> @@ -1689,9 +1690,27 @@ namespace gbe
>      const GenRegister src = ra->genReg(insn.src(0));
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const uint32_t function = insn.extra.function;
> -    const uint32_t bti = insn.getbti();
> +    unsigned srcNum = insn.extra.elem;
> +
> +    const GenRegister bti = ra->genReg(insn.src(srcNum));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->ATOMIC(dst, function, src, bti, srcNum);
> +    } else {
> +      GenRegister flagTemp = ra->genReg(insn.dst(1));
> +
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateAtomicMessageDesc(&nativeInsn, function, 0, srcNum);
>  
> -    p->ATOMIC(dst, function, src, bti, insn.srcNum);
> +      unsigned jip0 = beforeMessage(insn, bti, flagTemp, desc);
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
> +      p->pop();
> +      afterMessage(insn, bti, flagTemp, jip0);
> +    }
>    }
>  
>    void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
> @@ -1811,48 +1830,200 @@ namespace gbe
>    }
>  
>    void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
> -    const uint32_t elemNum = insn.extra.elem;
> +    const uint32_t elemNum = insn.extra.elem * 2;
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.getbti();
> -    p->UNTYPED_READ(dst, src, bti, elemNum*2);
> +    const GenRegister bti = ra->genReg(insn.src(1));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_READ(dst, src, bti, elemNum);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, elemNum);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
> +  }
> +  unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned desc) {
> +      const GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
> +      setFlag(flagReg, GenRegister::immuw(0));
> +      p->CMP(GEN_CONDITIONAL_NZ, flagReg, GenRegister::immuw(1));
> +
> +      GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
> +      GenRegister btiUW = ra->genReg(GenRegister::uw1grf(ir::ocl::btiUtil));
> +      GenRegister btiUB = ra->genReg(GenRegister::ub1grf(ir::ocl::btiUtil));
> +      unsigned jip0 = p->n_instruction();
> +      p->push();
> +        p->curr.execWidth = 1;
> +        p->curr.noMask = 1;
> +        p->AND(btiUD, flagReg, GenRegister::immud(0xffffffff));
> +        p->LZD(btiUD, btiUD);
> +        p->ADD(btiUW, GenRegister::negate(btiUW), GenRegister::immuw(0x1f));
> +        p->MUL(btiUW, btiUW, GenRegister::immuw(0x4));
> +        p->ADD(GenRegister::addr1(0), btiUW, GenRegister::immud(bti.nr*32));
> +        p->MOV(btiUD, GenRegister::indirect(GEN_TYPE_UD, 0, GEN_WIDTH_1));
> +        //save flag
> +        p->MOV(tmp, flagReg);
> +      p->pop();
> +
> +      p->CMP(GEN_CONDITIONAL_Z, bti, btiUD);
> +      p->push();
> +        p->curr.execWidth = 1;
> +        p->curr.noMask = 1;
> +        p->OR(GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), btiUB, GenRegister::immud(desc));
> +      p->pop();
> +      return jip0;
> +  }
> +  void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned jip0) {
> +    const GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
> +      //restore flag
> +      setFlag(GenRegister::flag(insn.state.flag, insn.state.subFlag), tmp);
> +      // get active channel
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->CMP(GEN_CONDITIONAL_NZ, bti, btiUD);
> +        unsigned jip1 = p->n_instruction();
> +        p->WHILE(GenRegister::immud(0));
> +      p->pop();
> +      p->patchJMPI(jip1, jip0 - jip1, 0);
>    }
>  
>    void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.getbti();
> +    const GenRegister bti = ra->genReg(insn.src(1));
> +
>      const uint32_t elemNum = insn.extra.elem;
> -    p->UNTYPED_READ(dst, src, bti, elemNum);
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_READ(dst, src, bti, elemNum);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedReadMessageDesc(&nativeInsn, 0, elemNum);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
>    }
>  
>    void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.dst(0));
>      const uint32_t elemNum = insn.extra.elem;
> -    const uint32_t bti = insn.getbti();
> -    p->UNTYPED_WRITE(src, bti, elemNum*2);
> +    const GenRegister bti = ra->genReg(insn.src(elemNum+1));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_WRITE(src, bti, elemNum*2);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(0));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum*2);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
>    }
>  
>    void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.getbti();
>      const uint32_t elemNum = insn.extra.elem;
> -    p->UNTYPED_WRITE(src, bti, elemNum);
> +    const GenRegister bti = ra->genReg(insn.src(elemNum+1));
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->UNTYPED_WRITE(src, bti, elemNum);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(0));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateUntypedWriteMessageDesc(&nativeInsn, 0, elemNum);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
>    }
>  
>    void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.getbti();
> +    const GenRegister bti = ra->genReg(insn.src(1));
>      const uint32_t elemSize = insn.extra.elem;
> -    p->BYTE_GATHER(dst, src, bti, elemSize);
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->BYTE_GATHER(dst, src, bti, elemSize);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(1));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateByteGatherMessageDesc(&nativeInsn, 0, elemSize);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->BYTE_GATHER(dst, src, GenRegister::addr1(0), elemSize);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
>    }
>  
>    void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.getbti();
>      const uint32_t elemSize = insn.extra.elem;
> -    p->BYTE_SCATTER(src, bti, elemSize);
> +    const GenRegister bti = ra->genReg(insn.src(2));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->BYTE_SCATTER(src, bti, elemSize);
> +    } else {
> +      const GenRegister tmp = ra->genReg(insn.dst(0));
> +      GenNativeInstruction nativeInsn;
> +      memset(&nativeInsn, 0, sizeof(GenNativeInstruction));
> +      unsigned desc = p->generateByteScatterMessageDesc(&nativeInsn, 0, elemSize);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
> +
> +      //predicated load
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
> +      p->pop();
> +      afterMessage(insn, bti, tmp, jip0);
> +    }
> +
>    }
>  
>    void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
> @@ -1988,6 +2159,7 @@ namespace gbe
>      allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
>      allocCurbeReg(zero, GBE_CURBE_ZERO);
>      allocCurbeReg(one, GBE_CURBE_ONE);
> +    allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
>      if (stackUse.size() != 0)
>        allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
>      // Go over the arguments and find the related patch locations
> diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
> index 560248a..a85657c 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -169,6 +169,8 @@ namespace gbe
>      virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
>      void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
>      void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
> +    unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned desc);
> +    void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned jip0);
>  
>      /*! Implements base class */
>      virtual Kernel *allocateKernel(void);
> diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
> index 5aa8c5c..7f2d464 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -329,10 +329,7 @@ namespace gbe
>      GEN_UNTYPED_ALPHA,
>      0
>    };
> -
> -  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
> -    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    assert(elemNum >= 1 || elemNum <= 4);
> +  unsigned GenEncoder::generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
>      if (this->curr.execWidth == 8) {
> @@ -340,49 +337,75 @@ namespace gbe
>        response_length = elemNum;
>      } else if (this->curr.execWidth == 16) {
>        msg_length = 2;
> -      response_length = 2*elemNum;
> +      response_length = 2 * elemNum;
>      } else
>        NOT_IMPLEMENTED;
> -
> -    this->setHeader(insn);
> -    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN7_UNTYPED_READ,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> -  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
> +  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
> +
> +    this->setHeader(insn);
> +    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +
> +  unsigned GenEncoder::generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> -    this->setHeader(insn);
>      if (this->curr.execWidth == 8) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
> -      msg_length = 1+elemNum;
> +      msg_length = 1 + elemNum;
>      } else if (this->curr.execWidth == 16) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
> -      msg_length = 2*(1+elemNum);
> +      msg_length = 2 * (1 + elemNum);
>      }
>      else
>        NOT_IMPLEMENTED;
> -    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(insn,
>                     bti,
>                     untypedRWMask[elemNum],
>                     GEN7_UNTYPED_WRITE,
>                     msg_length,
>                     response_length);
> +    return insn->bits3.ud;
>    }
>  
> -  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
> +  void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    assert(elemNum >= 1 || elemNum <= 4);
> +    this->setHeader(insn);
> +    if (this->curr.execWidth == 8) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
> +    } else if (this->curr.execWidth == 16) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
> +    }
> +    else
> +      NOT_IMPLEMENTED;
> +    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +  unsigned GenEncoder::generateByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
>      if (this->curr.execWidth == 8) {
> @@ -393,11 +416,6 @@ namespace gbe
>        response_length = 2;
>      } else
>        NOT_IMPLEMENTED;
> -
> -    this->setHeader(insn);
> -    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
>      setDPByteScatterGather(this,
>                             insn,
>                             bti,
> @@ -405,23 +423,35 @@ namespace gbe
>                             GEN7_BYTE_GATHER,
>                             msg_length,
>                             response_length);
> +    return insn->bits3.ud;
> +
>    }
>  
> -  void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
> +  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateByteGatherMessageDesc(insn, bti.value.ud, elemSize);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +  unsigned GenEncoder::generateByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
> -    this->setHeader(insn);
>      if (this->curr.execWidth == 8) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
>        msg_length = 2;
>      } else if (this->curr.execWidth == 16) {
> -      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
>        msg_length = 4;
>      } else
>        NOT_IMPLEMENTED;
> -    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
> +
>      setDPByteScatterGather(this,
>                             insn,
>                             bti,
> @@ -429,6 +459,30 @@ namespace gbe
>                             GEN7_BYTE_SCATTER,
>                             msg_length,
>                             response_length);
> +    return insn->bits3.ud;
> +  }
> +
> +  void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +
> +    if (this->curr.execWidth == 8) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
> +    } else if (this->curr.execWidth == 16) {
> +      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
> +    } else
> +      NOT_IMPLEMENTED;
> +
> +    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateByteScatterMessageDesc(insn, bti.value.ud, elemSize);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
>    }
>  
>    void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
> @@ -460,9 +514,7 @@ namespace gbe
>                             response_length);
>  
>    }
> -
> -  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
> -    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +  unsigned GenEncoder::generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
>      uint32_t msg_length = 0;
>      uint32_t response_length = 0;
>  
> @@ -470,16 +522,11 @@ namespace gbe
>        msg_length = srcNum;
>        response_length = 1;
>      } else if (this->curr.execWidth == 16) {
> -      msg_length = 2*srcNum;
> +      msg_length = 2 * srcNum;
>        response_length = 2;
>      } else
>        NOT_IMPLEMENTED;
>  
> -    this->setHeader(insn);
> -    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> -    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    this->setSrc1(insn, GenRegister::immud(0));
> -
>      const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
>      setMessageDescriptor(insn, sfid, msg_length, response_length);
>      insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
> @@ -493,7 +540,23 @@ namespace gbe
>        insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
>      else
>        NOT_SUPPORTED;
> +    return insn->bits3.ud;
> +  }
>  
> +  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      generateAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
> +    } else {
> +      this->setSrc1(insn, bti);
> +    }
>    }
>    GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
>      GenCompactInstruction insn;
> @@ -893,6 +956,8 @@ namespace gbe
>    ALU2_BRA(BRD)
>    ALU2_BRA(BRC)
>  
> +  // jip is the distance between jump instruction and jump-target. we have handled
> +  // pre/post-increment in patchJMPI() function body
>    void GenEncoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
>      GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
>      GBE_ASSERT(insnID < this->store.size());
> diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
> index 21faabc..f016e91 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -169,15 +169,15 @@ namespace gbe
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
>      /*! Atomic instructions */
> -    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
> +    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
>      /*! Untyped read (upto 4 channels) */
> -    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
> +    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
>      /*! Untyped write (upto 4 channels) */
> -    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
> +    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
>      /*! Byte gather (for unaligned bytes, shorts and ints) */
> -    void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
> +    void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
>      /*! Byte scatter (for unaligned bytes, shorts and ints) */
> -    void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
> +    void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
>      /*! DWord gather (for constant cache read) */
>      void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
>      /*! for scratch memory read */
> @@ -230,6 +230,12 @@ namespace gbe
>      void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
>                                unsigned msg_length, unsigned response_length,
>                                bool header_present = false, bool end_of_thread = false);
> +    virtual unsigned generateAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
> +    virtual unsigned generateUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
> +    virtual unsigned generateUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
> +    unsigned generateByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
> +    unsigned generateByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
> +
>      virtual void setHeader(GenNativeInstruction *insn) = 0;
>      virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
>      virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 19a3c24..367dcdb 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -595,19 +595,19 @@ namespace gbe
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
>      /*! Atomic instruction */
> -    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
> +    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp);
>      /*! Read 64 bits float/int array */
> -    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, uint32_t bti, bool native_long);
> +    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, GenRegister *flagTemp);
>      /*! Write 64 bits float/int array */
> -    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, uint32_t bti, bool native_long);
> +    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, GenRegister *flagTemp);
>      /*! Untyped read (up to 4 elements) */
> -    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
> +    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
>      /*! Untyped write (up to 4 elements) */
> -    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
> +    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
>      /*! Byte gather (for unaligned bytes, shorts and ints) */
> -    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
> +    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
>      /*! Byte scatter (for unaligned bytes, shorts and ints) */
> -    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
> +    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
>      /*! DWord scatter (for constant cache read) */
>      void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
>      /*! Unpack the uint to charN */
> @@ -1197,16 +1197,26 @@ namespace gbe
>  
>    void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
>                                       uint32_t srcNum, Reg src0,
> -                                     Reg src1, Reg src2, uint32_t bti) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
> +                                     Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp) {
> +    unsigned dstNum = flagTemp == NULL ? 1 : 2;
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
> +
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
>      insn->dst(0) = dst;
> +    if(flagTemp) insn->dst(1) = *flagTemp;
> +
>      insn->src(0) = src0;
>      if(srcNum > 1) insn->src(1) = src1;
>      if(srcNum > 2) insn->src(2) = src2;
> +    insn->src(srcNum) = bti;
>      insn->extra.function = function;
> -    insn->setbti(bti);
> -    SelectionVector *vector = this->appendVector();
> +    insn->extra.elem = srcNum;
>  
> +    SelectionVector *vector = this->appendVector();
>      vector->regNum = srcNum;
>      vector->reg = &insn->src(0);
>      vector->isSrc = 1;
> @@ -1220,22 +1230,29 @@ namespace gbe
>                                   const GenRegister *dst,
>                                   const GenRegister *tmp,
>                                   uint32_t elemNum,
> -                                 uint32_t bti,
> -                                 bool native_long)
> +                                 const GenRegister bti,
> +                                 bool native_long,
> +                                 GenRegister *flagTemp)
>    {
>      SelectionInstruction *insn = NULL;
>      SelectionVector *srcVector = NULL;
>      SelectionVector *dstVector = NULL;
>  
>      if (!native_long) {
> -      insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
> +      unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
> +      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
>        srcVector = this->appendVector();
>        dstVector = this->appendVector();
>        // Regular instruction to encode
>        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>          insn->dst(elemID) = dst[elemID];
> +
> +      // flagTemp don't need to be put in SelectionVector
> +      if (flagTemp)
> +        insn->dst(elemNum) = *flagTemp;
>      } else {
> -      insn = this->appendInsn(SEL_OP_READ64, elemNum*2, 1);
> +      unsigned dstNum = flagTemp == NULL ? elemNum*2 : elemNum*2+1;
> +      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
>        srcVector = this->appendVector();
>        dstVector = this->appendVector();
>  
> @@ -1244,10 +1261,20 @@ namespace gbe
>  
>        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>          insn->dst(elemID + elemNum) = dst[elemID];
> +
> +      // flagTemp don't need to be put in SelectionVector
> +      if (flagTemp)
> +        insn->dst(2*elemNum) = *flagTemp;
> +    }
> +
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
>      }
>  
>      insn->src(0) = addr;
> -    insn->setbti(bti);
> +    insn->src(1) = bti;
> +
>      insn->extra.elem = elemNum;
>  
>      dstVector->regNum = elemNum;
> @@ -1262,9 +1289,11 @@ namespace gbe
>    void Selection::Opaque::UNTYPED_READ(Reg addr,
>                                         const GenRegister *dst,
>                                         uint32_t elemNum,
> -                                       uint32_t bti)
> +                                       GenRegister bti,
> +                                       GenRegister *flagTemp)
>    {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
> +    unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
>      SelectionVector *srcVector = this->appendVector();
>      SelectionVector *dstVector = this->appendVector();
>      if (this->isScalarReg(dst[0].reg()))
> @@ -1272,8 +1301,16 @@ namespace gbe
>      // Regular instruction to encode
>      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>        insn->dst(elemID) = dst[elemID];
> +    if (flagTemp)
> +      insn->dst(elemNum) = *flagTemp;
> +
>      insn->src(0) = addr;
> -    insn->setbti(bti);
> +    insn->src(1) = bti;
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
>      insn->extra.elem = elemNum;
>  
>      // Sends require contiguous allocation
> @@ -1290,31 +1327,40 @@ namespace gbe
>                                    const GenRegister *src,
>                                    const GenRegister *tmp,
>                                    uint32_t srcNum,
> -                                  uint32_t bti,
> -                                  bool native_long)
> +                                  GenRegister bti,
> +                                  bool native_long,
> +                                  GenRegister *flagTemp)
>    {
>      SelectionVector *vector = NULL;
>      SelectionInstruction *insn = NULL;
>  
>      if (!native_long) {
> -      insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
> +      unsigned dstNum = flagTemp == NULL ? 0 : 1;
> +      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
>        vector = this->appendVector();
> -      // Regular instruction to encode
> +      // Register layout:
> +      // dst: (flagTemp)
> +      // src: addr, srcNum, bti
>        insn->src(0) = addr;
>        for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
>          insn->src(elemID + 1) = src[elemID];
>  
> -      insn->setbti(bti);
> +      insn->src(srcNum+1) = bti;
> +      if (flagTemp)
> +        insn->dst(0) = *flagTemp;
>        insn->extra.elem = srcNum;
>  
>        vector->regNum = srcNum + 1;
>        vector->reg = &insn->src(0);
>        vector->isSrc = 1;
>      } else { // handle the native long case
> -      insn = this->appendInsn(SEL_OP_WRITE64, srcNum, srcNum*2 + 1);
> +      unsigned dstNum = flagTemp == NULL ? srcNum : srcNum+1;
> +      // Register layout:
> +      // dst: srcNum, (flagTemp)
> +      // src: srcNum, addr, srcNum, bti.
> +      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
>        vector = this->appendVector();
>  
> -      insn->src(0) = addr;
>        for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
>          insn->src(elemID) = src[elemID];
>  
> @@ -1322,33 +1368,50 @@ namespace gbe
>        for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
>          insn->src(srcNum + 1 + elemID) = tmp[0];
>  
> +      insn->src(srcNum*2+1) = bti;
>        /* We also need to add the tmp reigster to dst, in order
>           to avoid the post schedule error . */
>        for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
>          insn->dst(elemID) = tmp[0];
>  
> -      insn->setbti(bti);
> +      if (flagTemp)
> +        insn->dst(srcNum) = *flagTemp;
>        insn->extra.elem = srcNum;
>  
>        vector->regNum = srcNum + 1;
>        vector->reg = &insn->src(srcNum);
>        vector->isSrc = 1;
>      }
> +
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
>    }
>  
>    void Selection::Opaque::UNTYPED_WRITE(Reg addr,
>                                          const GenRegister *src,
>                                          uint32_t elemNum,
> -                                        uint32_t bti)
> +                                        GenRegister bti,
> +                                        GenRegister *flagTemp)
>    {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
> +    unsigned dstNum = flagTemp == NULL ? 0 : 1;
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, elemNum+2);
>      SelectionVector *vector = this->appendVector();
>  
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
> +    if (flagTemp) insn->dst(0) = *flagTemp;
>      // Regular instruction to encode
>      insn->src(0) = addr;
>      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>        insn->src(elemID+1) = src[elemID];
> -    insn->setbti(bti);
> +    insn->src(elemNum+1) = bti;
> +    if (flagTemp)
> +      insn->src(elemNum+2) = *flagTemp;
>      insn->extra.elem = elemNum;
>  
>      // Sends require contiguous allocation for the sources
> @@ -1357,17 +1420,26 @@ namespace gbe
>      vector->isSrc = 1;
>    }
>  
> -  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
> +  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
> +    unsigned dstNum = flagTemp == NULL ? 1 : 2;
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2);
>      SelectionVector *srcVector = this->appendVector();
>      SelectionVector *dstVector = this->appendVector();
>  
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
>      if (this->isScalarReg(dst.reg()))
>        insn->state.noMask = 1;
>      // Instruction to encode
>      insn->src(0) = addr;
> +    insn->src(1) = bti;
>      insn->dst(0) = dst;
> -    insn->setbti(bti);
> +    if (flagTemp)
> +      insn->dst(1) = *flagTemp;
> +
>      insn->extra.elem = elemSize;
>  
>      // byte gather requires vector in the sense that scalar are not allowed
> @@ -1380,14 +1452,22 @@ namespace gbe
>      srcVector->reg = &insn->src(0);
>    }
>  
> -  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
> +  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
> +    unsigned dstNum = flagTemp == NULL ? 0 : 1;
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
>      SelectionVector *vector = this->appendVector();
>  
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
> +    if (flagTemp)
> +      insn->dst(0) = *flagTemp;
>      // Instruction to encode
>      insn->src(0) = addr;
>      insn->src(1) = src;
> -    insn->setbti(bti);
> +    insn->src(2) = bti;
>      insn->extra.elem = elemSize;
>  
>      // value and address are contiguous in the send
> @@ -3096,34 +3176,31 @@ namespace gbe
>      }
>    }
>  
> -  /*! Load instruction pattern */
> -  DECL_PATTERN(LoadInstruction)
> +  static void markAllChildrenExceptBTI(SelectionDAG &dag) {
> +    // Do not merge anything, so all sources become roots
> +    for (uint32_t childID = 1; childID < dag.childNum; ++childID)
> +      if (dag.child[childID])
> +        dag.child[childID]->isRoot = 1;
> +  }
This routine looks a little bit hacky to me. Simply marking the fixed BTI as a root
as well, and adding some patterns to handle it in the instruction selection stage,
would be much easier to understand.

> +
> +  class LoadInstructionPattern : public SelectionPattern
>    {
> +  public:
> +    /*! Register the pattern for all opcodes of the family */
> +    LoadInstructionPattern(void) : SelectionPattern(1, 1) {
> +       this->opcodes.push_back(ir::OP_LOAD);
> +    }
>      void readDWord(Selection::Opaque &sel,
>                     vector<GenRegister> &dst,
> -                   vector<GenRegister> &dst2,
>                     GenRegister addr,
>                     uint32_t valueNum,
>                     ir::BTI bti) const
>      {
> -      for (uint32_t x = 0; x < bti.count; x++) {
> -        if(x > 0)
> -          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
> -            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
> -
> -        GenRegister temp = getRelativeAddress(sel, addr, bti.bti[x]);
> -        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
> -        if(x > 0) {
> -          sel.push();
> -            if(sel.isScalarReg(dst[0].reg())) {
> -              sel.curr.noMask = 1;
> -              sel.curr.execWidth = 1;
> -            }
> -            for (uint32_t y = 0; y < valueNum; y++)
> -              sel.ADD(dst[y], dst[y], dst2[y]);
> -          sel.pop();
> -        }
> -      }
> +        //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
> +
> +        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
> +        GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
> +        sel.UNTYPED_READ(addr, dst.data(), valueNum, b, bti.isConst ? NULL : &tmp);
>      }
>  
>      void emitUntypedRead(Selection::Opaque &sel,
> @@ -3134,10 +3211,9 @@ namespace gbe
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
>        vector<GenRegister> dst(valueNum);
> -      vector<GenRegister> dst2(valueNum);
>        for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
> -        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
> -      readDWord(sel, dst, dst2, addr, valueNum, bti);
> +        dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
> +      readDWord(sel, dst, addr, valueNum, bti);
>      }
>  
>      void emitDWordGather(Selection::Opaque &sel,
> @@ -3146,15 +3222,15 @@ namespace gbe
>                           ir::BTI bti) const
>      {
>        using namespace ir;
> -      GBE_ASSERT(bti.count == 1);
> -      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
> +      GBE_ASSERT(bti.isConst == 1);
>        GBE_ASSERT(insn.getValueNum() == 1);
> +      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
>  
>        if(isUniform) {
>          GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
>          sel.push();
>            sel.curr.noMask = 1;
> -          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
> +          sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
>          sel.pop();
>          return;
>        }
> @@ -3170,7 +3246,7 @@ namespace gbe
>          sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
>        sel.pop();
>  
> -      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
> +      sel.DWORD_GATHER(dst, addrDW, bti.imm);
>      }
>  
>      void emitRead64(Selection::Opaque &sel,
> @@ -3182,9 +3258,10 @@ namespace gbe
>        const uint32_t valueNum = insn.getValueNum();
>        /* XXX support scalar only right now. */
>        GBE_ASSERT(valueNum == 1);
> -      GBE_ASSERT(bti.count == 1);
> +      GBE_ASSERT(bti.isConst == 1);
>        vector<GenRegister> dst(valueNum);
> -      GenRegister tmpAddr = getRelativeAddress(sel, addr, bti.bti[0]);
> +      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
> +      GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
>        for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
>          dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
>  
> @@ -3194,9 +3271,9 @@ namespace gbe
>            tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
>          }
>  
> -        sel.READ64(tmpAddr, dst.data(), tmp.data(), valueNum, bti.bti[0], true);
> +        sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, bti.isConst ? NULL : &tmpFlag);
>        } else {
> -        sel.READ64(tmpAddr, dst.data(), NULL, valueNum, bti.bti[0], false);
> +        sel.READ64(addr, dst.data(), NULL, valueNum, b, false, bti.isConst ? NULL : &tmpFlag);
>        }
>      }
>  
> @@ -3205,12 +3282,16 @@ namespace gbe
>                          GenRegister address,
>                          GenRegister dst,
>                          bool isUniform,
> -                        uint8_t bti) const
> +                        ir::BTI bti) const
>      {
>        using namespace ir;
>          Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
>          GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
>          GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
> +
> +        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
> +        GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
> +
>          // Get dword aligned addr
>          sel.push();
>            if (isUniform) {
> @@ -3222,7 +3303,7 @@ namespace gbe
>          sel.push();
>            if (isUniform)
>              sel.curr.noMask = 1;
> -          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
> +          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, bti.isConst ? NULL : &tmpFlag);
>  
>            if (isUniform)
>              sel.curr.execWidth = 1;
> @@ -3258,14 +3339,11 @@ namespace gbe
>  
>        uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
>        vector<GenRegister> tmp(tmpRegNum);
> -      vector<GenRegister> tmp2(tmpRegNum);
> -      vector<Register> tmpReg(tmpRegNum);
>        for(uint32_t i = 0; i < tmpRegNum; i++) {
> -        tmpReg[i] = sel.reg(FAMILY_DWORD, isUniform);
> -        tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32);
> +        tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
>        }
>  
> -      readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
> +      readDWord(sel, tmp, address, tmpRegNum, bti);
>  
>        for(uint32_t i = 0; i < tmpRegNum; i++) {
>          unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
> @@ -3370,7 +3448,7 @@ namespace gbe
>                sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
>              sel.pop();
>            }
> -          readDWord(sel, t1, t2, alignedAddr, width, bti);
> +          readDWord(sel, t1, alignedAddr, width, bti);
>            remainedReg -= width;
>            pos += width;
>          } while(remainedReg);
> @@ -3389,24 +3467,8 @@ namespace gbe
>          GBE_ASSERT(insn.getValueNum() == 1);
>          const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
>          GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
> -        GenRegister tmp = value;
>  
> -        for (int x = 0; x < bti.count; x++) {
> -          if (x > 0)
> -            tmp = sel.selReg(sel.reg(family, isUniform), insn.getValueType());
> -
> -          GenRegister addr = getRelativeAddress(sel, address, bti.bti[x]);
> -          readByteAsDWord(sel, elemSize, addr, tmp, isUniform, bti.bti[x]);
> -          if (x > 0) {
> -            sel.push();
> -              if (isUniform) {
> -                sel.curr.noMask = 1;
> -                sel.curr.execWidth = 1;
> -              }
> -              sel.ADD(value, value, tmp);
> -            sel.pop();
> -          }
> -        }
> +        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
>        }
>      }
>  
> @@ -3422,30 +3484,33 @@ namespace gbe
>        sel.INDIRECT_MOVE(dst, src);
>      }
>  
> -    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
> -      if (bti == 0xfe || bti == BTI_CONSTANT)
> -        return address;
> -
> -      sel.push();
> -        sel.curr.noMask = 1;
> -        if (GenRegister::hstride_size(address) == 0)
> -          sel.curr.execWidth = 1;
> -        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, sel.curr.execWidth == 1), ir::TYPE_U32);
> -        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
> -      sel.pop();
> -      return temp;
> -    }
> -    // check whether all binded table index point to constant memory
>      INLINE bool isAllConstant(const ir::BTI &bti) const {
> -      for (int x = 0; x < bti.count; x++) {
> -         if (bti.bti[x] != BTI_CONSTANT)
> -           return false;
> +      if (bti.isConst && bti.imm == BTI_CONSTANT)
> +        return true;
> +      return false;
> +    }
> +
> +    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
> +      using namespace ir;
> +      SelectionDAG *child0 = dag.child[0];
> +      ir::BTI b;
> +      if (insn.isFixedBTI()) {
> +        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> +        const auto imm = immInsn.getImmediate();
> +        b.isConst = 1;
> +        b.imm = imm.getIntegerValue();
> +      } else {
> +        b.isConst = 0;
> +        b.reg = insn.getBTI();
>        }
> -      return true;
> +      return b;
>      }
>  
> -    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
> +    /*! Implements base class */
> +    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
> +    {
>        using namespace ir;
> +      const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
>        GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
>        GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
>                   insn.getAddressSpace() == MEM_CONSTANT ||
> @@ -3453,9 +3518,11 @@ namespace gbe
>                   insn.getAddressSpace() == MEM_LOCAL ||
>                   insn.getAddressSpace() == MEM_MIXED);
>        //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
> +
> +      BTI bti = getBTI(dag, insn);
> +
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(type);
> -      const BTI &bti = insn.getBTI();
>        bool allConstant = isAllConstant(bti);
>  
>        if (allConstant) {
> @@ -3480,65 +3547,78 @@ namespace gbe
>          else
>            this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        }
> +
> +      if (insn.isFixedBTI()) {
> +        markAllChildrenExceptBTI(dag);
> +      } else {
> +        markAllChildren(dag);
> +      }
>        return true;
>      }
> -    DECL_CTOR(LoadInstruction, 1, 1);
>    };
> -
> -  /*! Store instruction pattern */
> -  DECL_PATTERN(StoreInstruction)
> +  class StoreInstructionPattern : public SelectionPattern
>    {
> +  public:
> +    /*! Register the pattern for all opcodes of the family */
> +    StoreInstructionPattern(void) : SelectionPattern(1, 1) {
> +       this->opcodes.push_back(ir::OP_STORE);
> +    }
>      void emitUntypedWrite(Selection::Opaque &sel,
>                            const ir::StoreInstruction &insn,
> -                          GenRegister addr,
> -                          uint32_t bti) const
> +                          GenRegister address,
> +                          ir::BTI &bti) const
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
>        vector<GenRegister> value(valueNum);
> +      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
>  
> -      addr = GenRegister::retype(addr, GEN_TYPE_F);
>        for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
> -        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
> -      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
> +        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
> +      GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
> +      sel.UNTYPED_WRITE(address, value.data(), valueNum, b, bti.isConst? NULL : &tmp);
>      }
>  
>      void emitWrite64(Selection::Opaque &sel,
>                       const ir::StoreInstruction &insn,
> -                     GenRegister addr,
> -                     uint32_t bti) const
> +                     GenRegister address,
> +                     ir::BTI &bti) const
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
>        /* XXX support scalar only right now. */
>        GBE_ASSERT(valueNum == 1);
> -      addr = GenRegister::retype(addr, GEN_TYPE_UD);
> +      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
>        vector<GenRegister> src(valueNum);
>  
>        for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
>          src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
>  
> +      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
> +
>        if (sel.hasLongType()) {
>          vector<GenRegister> tmp(valueNum);
>          for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
>            tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
>          }
> -        sel.WRITE64(addr, src.data(), tmp.data(), valueNum, bti, true);
> +        sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, bti.isConst? NULL : &tmpFlag);
>        } else {
> -        sel.WRITE64(addr, src.data(), NULL, valueNum, bti, false);
> +        sel.WRITE64(address, src.data(), NULL, valueNum, b, false, bti.isConst? NULL : &tmpFlag);
>        }
>      }
>  
>      void emitByteScatter(Selection::Opaque &sel,
>                           const ir::StoreInstruction &insn,
>                           const uint32_t elemSize,
> -                         GenRegister addr,
> -                         uint32_t bti,
> +                         GenRegister address,
> +                         ir::BTI &bti,
>                           bool isUniform) const
>      {
>        using namespace ir;
>        uint32_t valueNum = insn.getValueNum();
>  
> +      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
> +      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
>        if(valueNum > 1) {
>          const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
>          vector<GenRegister> value(valueNum);
> @@ -3558,11 +3638,12 @@ namespace gbe
>            sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
>          }
>  
> -        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
> +        sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, bti.isConst ? NULL : &tmpFlag);
>        } else {
>          const GenRegister value = sel.selReg(insn.getValue(0));
>          GBE_ASSERT(insn.getValueNum() == 1);
>          const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
> +
>          sel.push();
>            if (isUniform) {
>              sel.curr.noMask = 1;
> @@ -3574,47 +3655,52 @@ namespace gbe
>            else if (elemSize == GEN_BYTE_SCATTER_BYTE)
>              sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
>          sel.pop();
> -        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
> +        sel.BYTE_SCATTER(address, tmp, elemSize, b, bti.isConst ? NULL : &tmpFlag);
>        }
>      }
>  
> -    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti, bool isUniform) const {
> -      if(bti == 0xfe)
> -        return address;
>  
> -      sel.push();
> -        sel.curr.noMask = 1;
> -        if (isUniform)
> -          sel.curr.execWidth = 1;
> -        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, isUniform), ir::TYPE_U32);
> -        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
> -      sel.pop();
> -      return temp;
> +    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
> +      using namespace ir;
> +      SelectionDAG *child0 = dag.child[0];
> +      ir::BTI b;
> +      if (insn.isFixedBTI()) {
> +        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> +        const auto imm = immInsn.getImmediate();
> +        b.isConst = 1;
> +        b.imm = imm.getIntegerValue();
> +      } else {
> +        b.isConst = 0;
> +        b.reg = insn.getBTI();
> +      }
> +      return b;
>      }
> -
> -    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
> +    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
>      {
>        using namespace ir;
> +      const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
> +      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(type);
> -      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
>  
>        const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
> +      BTI bti = getBTI(dag, insn);
>  
> -      BTI bti = insn.getBTI();
> -      for (int x = 0; x < bti.count; x++) {
> -        GenRegister temp = getRelativeAddress(sel, address, bti.bti[x], isUniform);
> -        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> -          this->emitWrite64(sel, insn, temp, bti.bti[x]);
> -        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> -          this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
> -        else {
> -          this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x], isUniform);
> -        }
> +      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> +        this->emitWrite64(sel, insn, address, bti);
> +      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> +        this->emitUntypedWrite(sel, insn, address,  bti);
> +      else {
> +        this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
> +      }
> +
> +      if (insn.isFixedBTI()) {
> +        markAllChildrenExceptBTI(dag);
> +      } else {
> +        markAllChildren(dag);
>        }
>        return true;
>      }
> -    DECL_CTOR(StoreInstruction, 1, 1);
>    };
>  
>    /*! Compare instruction pattern */
> @@ -4178,38 +4264,61 @@ namespace gbe
>      DECL_CTOR(ConvertInstruction, 1, 1);
>    };
>  
> -  /*! Convert instruction pattern */
> -  DECL_PATTERN(AtomicInstruction)
> +  /*! atomic instruction pattern */
> +  class AtomicInstructionPattern : public SelectionPattern
>    {
> -    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
> -    {
> +  public:
> +    AtomicInstructionPattern(void) : SelectionPattern(1,1) {
> +      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
> +        if (ir::isOpcodeFrom<ir::AtomicInstruction>(ir::Opcode(op)) == true)
> +          this->opcodes.push_back(ir::Opcode(op));
> +    }
> +
> +    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
> +      using namespace ir;
> +      SelectionDAG *child0 = dag.child[0];
> +      ir::BTI b;
> +      if (insn.isFixedBTI()) {
> +        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> +        const auto imm = immInsn.getImmediate();
> +        b.isConst = 1;
> +        b.imm = imm.getIntegerValue();
> +      } else {
> +        b.isConst = 0;
> +        b.reg = insn.getBTI();
> +      }
> +      return b;
> +    }
> +
> +    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
>        using namespace ir;
> +      const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
> +
> +      ir::BTI b = getBTI(dag, insn);
>        const AtomicOps atomicOp = insn.getAtomicOpcode();
> -      const AddressSpace space = insn.getAddressSpace();
> -      const uint32_t srcNum = insn.getSrcNum();
> +      unsigned srcNum = insn.getSrcNum();
> +      unsigned opNum = srcNum - 1;
>  
> -      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
> -      GenRegister src1 = src0, src2 = src0;
> -      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
> -      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
>        GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
> +      GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
> +      GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32);   //address
> +      GenRegister src1 = src0, src2 = src0;
> +      if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
> +      if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
> +
> +      GenRegister flagTemp = sel.selReg(sel.reg(FAMILY_WORD, true), TYPE_U16);
> +
>        GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
> -      if(space == MEM_LOCAL) {
> -        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
> +
> +      sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, b.isConst ? NULL : &flagTemp);
> +
> +      if (insn.isFixedBTI()) {
> +        markAllChildrenExceptBTI(dag);
>        } else {
> -        ir::BTI b = insn.getBTI();
> -        for (int x = 0; x < b.count; x++) {
> -          sel.push();
> -            sel.curr.noMask = 1;
> -            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> -            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
> -          sel.pop();
> -          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
> -        }
> +        markAllChildren(dag);
>        }
>        return true;
>      }
> -    DECL_CTOR(AtomicInstruction, 1, 1);
>    };
>  
>    /*! Select instruction pattern */
> diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
> index 7c9bce5..37963f9 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -100,7 +100,7 @@ namespace gbe
>        struct {
>          /*! Store bti for loads/stores and function for math, atomic and compares */
>          uint16_t function:8;
> -        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
> +        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
>          uint16_t elem:8;
>        };
>        struct {
> @@ -149,14 +149,7 @@ namespace gbe
>      INLINE uint32_t getbti() const {
>        GBE_ASSERT(isRead() || isWrite());
>        switch (opcode) {
> -        case SEL_OP_ATOMIC: return extra.elem;
> -        case SEL_OP_BYTE_SCATTER:
> -        case SEL_OP_WRITE64:
> -        case SEL_OP_DWORD_GATHER:
> -        case SEL_OP_UNTYPED_WRITE:
> -        case SEL_OP_UNTYPED_READ:
> -        case SEL_OP_BYTE_GATHER:
> -        case SEL_OP_READ64: return extra.function;
> +        case SEL_OP_DWORD_GATHER: return extra.function;
>          case SEL_OP_SAMPLE: return extra.rdbti;
>          case SEL_OP_TYPED_WRITE: return extra.bti;
>          default:
> @@ -168,14 +161,7 @@ namespace gbe
>      INLINE void setbti(uint32_t bti) {
>        GBE_ASSERT(isRead() || isWrite());
>        switch (opcode) {
> -        case SEL_OP_ATOMIC: extra.elem = bti; return;
> -        case SEL_OP_BYTE_SCATTER:
> -        case SEL_OP_WRITE64:
> -        case SEL_OP_UNTYPED_WRITE:
> -        case SEL_OP_DWORD_GATHER:
> -        case SEL_OP_UNTYPED_READ:
> -        case SEL_OP_BYTE_GATHER:
> -        case SEL_OP_READ64: extra.function = bti; return;
> +        case SEL_OP_DWORD_GATHER: extra.function = bti; return;
>          case SEL_OP_SAMPLE: extra.rdbti = bti; return;
>          case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
>          default:
> diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
> index 581f823..ed1f572 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -838,6 +838,8 @@ namespace gbe
>        reg.absolute = 0;
>        reg.vstride = 0;
>        reg.hstride = 0;
> +      reg.a0_subnr = 0;
> +      reg.addr_imm = 0;
>        return reg;
>      }
>  
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index 8c171f5..3637ebb 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -103,6 +103,7 @@ enum gbe_curbe_type {
>    GBE_CURBE_ONE,
>    GBE_CURBE_LANE_ID,
>    GBE_CURBE_SLM_OFFSET,
> +  GBE_CURBE_BTI_UTIL,
>  };
>  
>  /*! Extra arguments use the negative range of sub-values */
> diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
> index af65ff3..54265d0 100644
> --- a/backend/src/ir/context.hpp
> +++ b/backend/src/ir/context.hpp
> @@ -190,22 +190,22 @@ namespace ir {
>  
>      /*! LOAD with the destinations directly specified */
>      template <typename... Args>
> -    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
> +    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
>      {
>        const Tuple index = this->tuple(values...);
>        const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
>        GBE_ASSERT(valueNum > 0);
> -      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
> +      this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
>      }
>  
>      /*! STORE with the sources directly specified */
>      template <typename... Args>
> -    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
> +    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
>      {
>        const Tuple index = this->tuple(values...);
>        const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
>        GBE_ASSERT(valueNum > 0);
> -      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
> +      this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
>      }
>      void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
>  
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index 7723b90..fe8807e 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -318,14 +318,14 @@ namespace ir {
>  
>      class ALIGNED_INSTRUCTION AtomicInstruction :
>        public BasePolicy,
> -      public TupleSrcPolicy<AtomicInstruction>,
>        public NDstPolicy<AtomicInstruction, 1>
>      {
>      public:
>        AtomicInstruction(AtomicOps atomicOp,
>                           Register dst,
>                           AddressSpace addrSpace,
> -                         BTI bti,
> +                         Register bti,
> +                         bool fixedBTI,
>                           Tuple src)
>        {
>          this->opcode = OP_ATOMIC;
> @@ -334,23 +334,43 @@ namespace ir {
>          this->src = src;
>          this->addrSpace = addrSpace;
>          this->bti = bti;
> +        this->fixedBTI = fixedBTI ? 1: 0;
>          srcNum = 2;
>          if((atomicOp == ATOMIC_OP_INC) ||
>            (atomicOp == ATOMIC_OP_DEC))
>            srcNum = 1;
>          if(atomicOp == ATOMIC_OP_CMPXCHG)
>            srcNum = 3;
> +        srcNum++;
>        }
> +      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
> +        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
> +        if (ID == 0u)
> +          return bti;
> +        else
> +          return fn.getRegister(src, ID -1);
> +      }
> +      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
> +        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
> +        if (ID == 0u)
> +          bti = reg;
> +        else
> +          fn.setRegister(src, ID - 1, reg);
> +      }
> +      INLINE uint32_t getSrcNum(void) const { return srcNum; }
> +
>        INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
> -      INLINE BTI getBTI(void) const { return bti; }
> +      INLINE Register getBTI(void) const { return bti; }
> +      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
>        INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
>        INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
>        INLINE void out(std::ostream &out, const Function &fn) const;
>        Register dst[1];
>        Tuple src;
>        AddressSpace addrSpace; //!< Address space
> -      BTI bti;               //!< bti
> -      uint8_t srcNum:2;     //!<Source Number
> +      Register bti;               //!< bti
> +      uint8_t fixedBTI:1;      //!< fixed bti or not
> +      uint8_t srcNum:3;     //!<Source Number
>        AtomicOps atomicOp:6;     //!<Source Number
>      };
>  
> @@ -410,7 +430,7 @@ namespace ir {
>  
>      class ALIGNED_INSTRUCTION LoadInstruction :
>        public BasePolicy,
> -      public NSrcPolicy<LoadInstruction, 1>
> +      public NSrcPolicy<LoadInstruction, 2>
>      {
>      public:
>        LoadInstruction(Type type,
> @@ -419,7 +439,8 @@ namespace ir {
>                        AddressSpace addrSpace,
>                        uint32_t valueNum,
>                        bool dwAligned,
> -                      BTI bti)
> +                      bool fixedBTI,
> +                      Register bti)
>        {
>          GBE_ASSERT(valueNum < 128);
>          this->opcode = OP_LOAD;
> @@ -429,6 +450,7 @@ namespace ir {
>          this->addrSpace = addrSpace;
>          this->valueNum = valueNum;
>          this->dwAligned = dwAligned ? 1 : 0;
> +        this->fixedBTI = fixedBTI ? 1 : 0;
>          this->bti = bti;
>        }
>        INLINE Register getDst(const Function &fn, uint32_t ID) const {
> @@ -443,16 +465,18 @@ namespace ir {
>        INLINE Type getValueType(void) const { return type; }
>        INLINE uint32_t getValueNum(void) const { return valueNum; }
>        INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
> -      INLINE BTI getBTI(void) const { return bti; }
> +      INLINE Register getBTI(void) const { return bti; }
>        INLINE bool wellFormed(const Function &fn, std::string &why) const;
>        INLINE void out(std::ostream &out, const Function &fn) const;
>        INLINE bool isAligned(void) const { return !!dwAligned; }
> +      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
>        Type type;              //!< Type to store
>        Register src[0];        //!< Address where to load from
> +      Register bti;
>        Register offset;        //!< Alias to make it similar to store
>        Tuple values;           //!< Values to load
>        AddressSpace addrSpace; //!< Where to load
> -      BTI bti;
> +      uint8_t fixedBTI:1;
>        uint8_t valueNum:7;     //!< Number of values to load
>        uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
>      };
> @@ -467,7 +491,8 @@ namespace ir {
>                         AddressSpace addrSpace,
>                         uint32_t valueNum,
>                         bool dwAligned,
> -                       BTI bti)
> +                       bool fixedBTI,
> +                       Register bti)
>        {
>          GBE_ASSERT(valueNum < 255);
>          this->opcode = OP_STORE;
> @@ -477,35 +502,42 @@ namespace ir {
>          this->addrSpace = addrSpace;
>          this->valueNum = valueNum;
>          this->dwAligned = dwAligned ? 1 : 0;
> +        this->fixedBTI = fixedBTI ? 1 : 0;
>          this->bti = bti;
>        }
>        INLINE Register getSrc(const Function &fn, uint32_t ID) const {
> -        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
> +        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
>          if (ID == 0u)
> +          return bti;
> +        else if (ID == 1u)
>            return offset;
>          else
> -          return fn.getRegister(values, ID - 1);
> +          return fn.getRegister(values, ID - 2);
>        }
>        INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
> -        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
> +        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
>          if (ID == 0u)
> +          bti = reg;
> +        else if (ID == 1u)
>            offset = reg;
>          else
> -          fn.setRegister(values, ID - 1, reg);
> +          fn.setRegister(values, ID - 2, reg);
>        }
> -      INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
> +      INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
>        INLINE uint32_t getValueNum(void) const { return valueNum; }
>        INLINE Type getValueType(void) const { return type; }
>        INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
> -      INLINE BTI getBTI(void) const { return bti; }
> +      INLINE Register getBTI(void) const { return bti; }
>        INLINE bool wellFormed(const Function &fn, std::string &why) const;
>        INLINE void out(std::ostream &out, const Function &fn) const;
>        INLINE bool isAligned(void) const { return !!dwAligned; }
> +      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
>        Type type;              //!< Type to store
> +      Register bti;
>        Register offset;        //!< First source is the offset where to store
>        Tuple values;           //!< Values to store
>        AddressSpace addrSpace; //!< Where to store
> -      BTI bti;                //!< Which btis need access
> +      uint8_t fixedBTI:1;                //!< Which btis need access
>        uint8_t valueNum:7;     //!< Number of values to store
>        uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
>        Register dst[0];        //!< No destination
> @@ -961,10 +993,12 @@ namespace ir {
>          return false;
>        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
>          return false;
> -      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
> -        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
> +      for (uint32_t srcID = 0; srcID < srcNum-1; ++srcID)
> +        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
>            return false;
>  
> +      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
> +        return false;
>        return true;
>      }
>  
> @@ -1165,12 +1199,10 @@ namespace ir {
>        this->outOpcode(out);
>        out << "." << addrSpace;
>        out << " %" << this->getDst(fn, 0);
> -      out << " {" << "%" << this->getSrc(fn, 0) << "}";
> -      for (uint32_t i = 1; i < srcNum; ++i)
> +      out << " {" << "%" << this->getSrc(fn, 1) << "}";
> +      for (uint32_t i = 2; i < srcNum; ++i)
>          out << " %" << this->getSrc(fn, i);
> -      out << " bti";
> -      for (uint32_t i = 0; i < bti.count; ++i)
> -        out << ": " << (int)bti.bti[i];
> +      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
>      }
>  
>  
> @@ -1204,22 +1236,18 @@ namespace ir {
>        for (uint32_t i = 0; i < valueNum; ++i)
>          out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
>        out << "}";
> -      out << " %" << this->getSrc(fn, 0);
> -      out << " bti";
> -      for (uint32_t i = 0; i < bti.count; ++i)
> -        out << ": " << (int)bti.bti[i];
> +      out << " %" << this->getSrc(fn, 1);
> +      out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
>      }
>  
>      INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
>        this->outOpcode(out);
>        out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
> -      out << " %" << this->getSrc(fn, 0) << " {";
> +      out << " %" << this->getSrc(fn, 1) << " {";
>        for (uint32_t i = 0; i < valueNum; ++i)
> -        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
> +        out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
>        out << "}";
> -      out << " bti";
> -      for (uint32_t i = 0; i < bti.count; ++i)
> -        out << ": " << (int)bti.bti[i];
> +      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
>      }
>  
>      INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
> @@ -1560,18 +1588,18 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
>  DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
>  DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
>  DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
> -DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
>  DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
> +DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
>  DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
>  DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
>  DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
> -DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
>  DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
> +DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
>  DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
>  DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
>  DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
> -DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
>  DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
> +DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
>  DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
>  DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
>  DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
> @@ -1735,8 +1763,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
>    }
>  
>    // For all unary functions with given opcode
> -  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
> -    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
> +  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
> +    return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
>    }
>  
>    // BRA
> @@ -1784,9 +1812,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
>                     AddressSpace space, \
>                     uint32_t valueNum, \
>                     bool dwAligned, \
> -                   BTI bti) \
> +                   bool fixedBTI, \
> +                   Register bti) \
>    { \
> -    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
> +    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
>    }
>  
>    DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
> diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
> index 436bfd2..23a7d00 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -36,10 +36,13 @@
>  namespace gbe {
>  namespace ir {
>    struct BTI {
> -    uint8_t bti[MAX_MIXED_POINTER];
> -    uint8_t count;
> -    BTI() : count(0) {
> -      memset(bti, 0, MAX_MIXED_POINTER);
> +    uint8_t isConst; // whether fixed bti
> +    union {
> +      Register reg;  // mixed reg
> +      unsigned short imm;  // fixed bti
> +    };
> +
> +    BTI() : isConst(0) {
>      }
>      ~BTI() {}
>    };
> @@ -289,10 +292,12 @@ namespace ir {
>    class AtomicInstruction : public Instruction {
>    public:
>      /*! Where the address register goes */
> -    static const uint32_t addressIndex = 0;
> +    static const uint32_t btiIndex = 0;
> +    static const uint32_t addressIndex = 1;
>      /*! Address space that is manipulated here */
>      AddressSpace getAddressSpace(void) const;
> -    BTI getBTI(void) const;
> +    Register getBTI(void) const { return this->getSrc(btiIndex); }
> +    bool isFixedBTI(void) const;
>      /*! Return the atomic function code */
>      AtomicOps getAtomicOpcode(void) const;
>      /*! Return the register that contains the addresses */
> @@ -307,12 +312,14 @@ namespace ir {
>    class StoreInstruction : public Instruction {
>    public:
>      /*! Where the address register goes */
> -    static const uint32_t addressIndex = 0;
> +    static const uint32_t btiIndex = 0;
> +    static const uint32_t addressIndex = 1;
>      /*! Return the types of the values to store */
>      Type getValueType(void) const;
>      /*! Give the number of values the instruction is storing (srcNum-1) */
>      uint32_t getValueNum(void) const;
> -    BTI getBTI(void) const;
> +    Register getBTI(void) const { return this->getSrc(btiIndex); }
> +    bool isFixedBTI(void) const;
>      /*! Address space that is manipulated here */
>      AddressSpace getAddressSpace(void) const;
>      /*! DWORD aligned means untyped read for Gen. That is what matters */
> @@ -322,7 +329,7 @@ namespace ir {
>      /*! Return the register that contain value valueID */
>      INLINE Register getValue(uint32_t valueID) const {
>        GBE_ASSERT(valueID < this->getValueNum());
> -      return this->getSrc(valueID + 1u);
> +      return this->getSrc(valueID + 2u);
>      }
>      /*! Return true if the given instruction is an instance of this class */
>      static bool isClassOf(const Instruction &insn);
> @@ -343,8 +350,9 @@ namespace ir {
>      /*! DWORD aligned means untyped read for Gen. That is what matters */
>      bool isAligned(void) const;
>      /*! Return the register that contains the addresses */
> -    INLINE Register getAddress(void) const { return this->getSrc(0u); }
> -    BTI getBTI(void) const;
> +    INLINE Register getAddress(void) const { return this->getSrc(1u); }
> +    Register getBTI(void) const {return this->getSrc(0u);}
> +    bool isFixedBTI(void) const;
>      /*! Return the register that contain value valueID */
>      INLINE Register getValue(uint32_t valueID) const {
>        return this->getDst(valueID);
> @@ -697,7 +705,7 @@ namespace ir {
>    /*! F32TO16.{dstType <- srcType} dst src */
>    Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
>    /*! atomic dst addr.space {src1 {src2}} */
> -  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
> +  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
>    /*! bra labelIndex */
>    Instruction BRA(LabelIndex labelIndex);
>    /*! (pred) bra labelIndex */
> @@ -713,9 +721,9 @@ namespace ir {
>    /*! ret */
>    Instruction RET(void);
>    /*! load.type.space {dst1,...,dst_valueNum} offset value */
> -  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
> +  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
>    /*! store.type.space offset {src1,...,src_valueNum} value */
> -  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
> +  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
>    /*! loadi.type dst value */
>    Instruction LOADI(Type type, Register dst, ImmediateIndex value);
>    /*! sync.params... (see Sync instruction) */
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 2f6539a..af9f698 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -45,7 +45,8 @@ namespace ir {
>          "printf_buffer_pointer", "printf_index_buffer_pointer",
>          "dwblockip",
>          "lane_id",
> -        "invalid"
> +        "invalid",
> +        "bti_utility"
>      };
>  
>  #if GBE_DEBUG
> @@ -91,6 +92,7 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
>        DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
>        DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
> +      DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
>      }
>  #undef DECL_NEW_REG
>  
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 4de6fe0..9323824 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -74,7 +74,8 @@ namespace ir {
>      static const Register dwblockip = Register(30);  // blockip
>      static const Register laneid = Register(31);  // lane id.
>      static const Register invalid = Register(32);  // used for valid comparation.
> -    static const uint32_t regNum = 33;             // number of special registers
> +    static const Register btiUtil = Register(33);  // used for mixed pointer as bti utility.
> +    static const uint32_t regNum = 34;             // number of special registers
>      extern const char *specialRegMean[];           // special register name.
>    } /* namespace ocl */
>  
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 6bde7bf..0c29e03 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -87,6 +87,7 @@
>  #endif  /* LLVM_VERSION_MINOR <= 2 */
>  #include "llvm/Pass.h"
>  #include "llvm/PassManager.h"
> +#include "llvm/IR/IRBuilder.h"
>  #if LLVM_VERSION_MINOR <= 2
>  #include "llvm/Intrinsics.h"
>  #include "llvm/IntrinsicInst.h"
> @@ -290,11 +291,8 @@ namespace gbe
>      return ir::MEM_GLOBAL;
>    }
>  
> -  static INLINE ir::AddressSpace btiToGen(const ir::BTI &bti) {
> -    if (bti.count > 1)
> -      return ir::MEM_MIXED;
> -    uint8_t singleBti = bti.bti[0];
> -    switch (singleBti) {
> +  static INLINE ir::AddressSpace btiToGen(const unsigned bti) {
> +    switch (bti) {
>        case BTI_CONSTANT: return ir::MEM_CONSTANT;
>        case BTI_PRIVATE: return  ir::MEM_PRIVATE;
>        case BTI_LOCAL: return ir::MEM_LOCAL;
> @@ -485,7 +483,14 @@ namespace gbe
>  
>      map<Value *, SmallVector<Value *, 4>> pointerOrigMap;
>      typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
> -
> +    // map pointer source to bti
> +    map<Value *, unsigned> BtiMap;
> +    // map ptr to its bti register
> +    map<Value *, Value *> BtiValueMap;
> +    // map ptr to it's base
> +    map<Value *, Value *> pointerBaseMap;
> +
> +    typedef map<Value *, Value *>::iterator PtrBaseMapIter;
>      /*! We visit each function twice. Once to allocate the registers and once to
>       *  emit the Gen IR instructions
>       */
> @@ -501,6 +506,7 @@ namespace gbe
>      } ConstTypeId;
>  
>      LoopInfo *LI;
> +    Function *Func;
>      const Module *TheModule;
>      int btiBase;
>    public:
> @@ -547,22 +553,33 @@ namespace gbe
>        bool bKernel = isKernelFunction(F);
>        if(!bKernel) return false;
>  
> +      Func = &F;
> +      assignBti(F);
>        analyzePointerOrigin(F);
> +
>        LI = &getAnalysis<LoopInfo>();
>        emitFunction(F);
>        phiMap.clear();
>        globalPointer.clear();
>        pointerOrigMap.clear();
> +      BtiMap.clear();
> +      BtiValueMap.clear();
> +      pointerBaseMap.clear();
>        // Reset for next function
>        btiBase = BTI_RESERVED_NUM;
>        return false;
>      }
>      /*! Given a possible pointer value, find out the interested escape like
>          load/store or atomic instruction */
> -    void findPointerEscape(Value *ptr);
> +    void findPointerEscape(Value *ptr, std::set<Value *> &mixedPtr, bool recordMixed);
>      /*! For all possible pointers, GlobalVariable, function pointer argument,
>          alloca instruction, find their pointer escape points */
>      void analyzePointerOrigin(Function &F);
> +    unsigned getNewBti(Value *origin);
> +    void assignBti(Function &F);
> +    bool isSingleBti(Value *Val);
> +    Value *getBtiRegister(Value *v);
> +    Value *getPointerBase(Value *ptr);
>  
>      virtual bool doFinalization(Module &M) { return false; }
>      /*! handle global variable register allocation (local, constant space) */
> @@ -660,10 +677,10 @@ namespace gbe
>      // batch vec4/8/16 load/store
>      INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                    Value *llvmValue, const ir::Register ptr,
> -                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
> -                  bool dwAligned);
> +                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::Register bti,
> +                  bool dwAligned, bool fixedBTI);
>      // handle load of dword/qword with unaligned address
> -    void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
> +    void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
>      void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
>      private:
>        ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
> @@ -675,7 +692,44 @@ namespace gbe
>  
>    char GenWriter::ID = 0;
>  
> -  void GenWriter::findPointerEscape(Value *ptr) {
> +  static void updatePointerSource(Value *parent, Value *theUser, Value *source, SmallVector<Value *, 4> &pointers) {
> +    if (isa<SelectInst>(theUser)) {
> +      SelectInst *si = dyn_cast<SelectInst>(theUser);
> +      if (si->getTrueValue() == parent)
> +        pointers[0] = source;
> +      else
> +        pointers[1] = source;
> +    } else if (isa<PHINode>(theUser)) {
> +      PHINode *phi = dyn_cast<PHINode>(theUser);
> +      unsigned opNum = phi->getNumIncomingValues();
> +      for (unsigned j = 0; j < opNum; j++) {
> +        if (phi->getIncomingValue(j) == parent) {
> +          pointers[j] = source;
> +        }
> +      }
> +    } else {
> +      pointers[0] = source;
> +    }
> +  }
> +
> +  bool isMixedPoint(Value *val, SmallVector<Value *, 4> &pointers) {
> +    Value *validSrc = NULL;
> +    unsigned i = 0;
> +    if (pointers.size() < 2) return false;
> +    while(i < pointers.size()) {
> +      if (pointers[i] != NULL && validSrc != NULL && pointers[i] != validSrc)
> +        return true;
> +      // when source is same as itself, we don't treat it as a new source
> +      // this often occurs for PHINode
> +      if (pointers[i] != NULL && validSrc == NULL && pointers[i] != val) {
> +        validSrc = pointers[i];
> +      }
> +      i++;
> +    }
> +    return false;
> +  }
> +
> +  void GenWriter::findPointerEscape(Value *ptr,  std::set<Value *> &mixedPtr, bool bFirstPass) {
>      std::vector<Value*> workList;
>      std::set<Value *> visited;
>  
> @@ -695,7 +749,52 @@ namespace gbe
>    #else
>          User *theUser = iter->getUser();
>    #endif
> -        if (visited.find(theUser) != visited.end()) continue;
> +        bool visitedInThisSource = visited.find(theUser) != visited.end();
> +
> +        if (isa<SelectInst>(theUser) || isa<PHINode>(theUser))
> +        {
> +          // reached from another source, update pointer source
> +          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
> +          if (ptrIter == pointerOrigMap.end()) {
> +            // create new one
> +            unsigned capacity = 1;
> +            if (isa<SelectInst>(theUser)) capacity = 2;
> +            if (isa<PHINode>(theUser)) {
> +              PHINode *phi = dyn_cast<PHINode>(theUser);
> +              capacity = phi->getNumIncomingValues();
> +            }
> +
> +            SmallVector<Value *, 4> pointers;
> +
> +            unsigned k = 0;
> +            while (k++ < capacity) {
> +              pointers.push_back(NULL);
> +            }
> +
> +            updatePointerSource(work, theUser, ptr, pointers);
> +            pointerOrigMap.insert(std::make_pair(theUser, pointers));
> +          } else {
> +            // update pointer source
> +            updatePointerSource(work, theUser, ptr, (*ptrIter).second);
> +          }
> +          ptrIter = pointerOrigMap.find(theUser);
> +
> +          if (isMixedPoint(theUser, (*ptrIter).second)) {
> +            // for the first pass, we need to record the mixed point instruction.
> +            // for the second pass, we don't need to go further, the reason is:
> +            // we always use it's 'direct mixed pointer parent' as origin, if we don't
> +            // stop here, we may set wrong pointer origin.
> +            if (bFirstPass)
> +              mixedPtr.insert(theUser);
> +            else
> +              continue;
> +          }
> +          // don't fall into dead loop,
> +          if (visitedInThisSource || theUser == ptr) {
> +            continue;
> +          }
> +        }
> +
>          // pointer address is used as the ValueOperand in store instruction, should be skipped
>          if (StoreInst *load = dyn_cast<StoreInst>(theUser)) {
>            if (load->getValueOperand() == work) {
> @@ -710,16 +809,30 @@ namespace gbe
>              Function *F = dyn_cast<CallInst>(theUser)->getCalledFunction();
>              if (!F || F->getIntrinsicID() != 0) continue;
>            }
> +          Value *pointer = NULL;
> +          if (isa<LoadInst>(theUser)) {
> +            pointer = dyn_cast<LoadInst>(theUser)->getPointerOperand();
> +          } else if (isa<StoreInst>(theUser)) {
> +            pointer = dyn_cast<StoreInst>(theUser)->getPointerOperand();
> +          } else if (isa<CallInst>(theUser)) {
> +            // atomic/read(write)image
> +            CallInst *ci = dyn_cast<CallInst>(theUser);
> +            pointer = ci->getArgOperand(0);
> +          } else {
> +            theUser->dump();
> +            GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
> +          }
>  
> -          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
> +          // load/store/atomic instruction, we have reached the end, stop further traversing
> +          PtrOrigMapIter ptrIter = pointerOrigMap.find(pointer);
>            if (ptrIter == pointerOrigMap.end()) {
>              // create new one
>              SmallVector<Value *, 4> pointers;
>              pointers.push_back(ptr);
> -            pointerOrigMap.insert(std::make_pair(theUser, pointers));
> +            pointerOrigMap.insert(std::make_pair(pointer, pointers));
>            } else {
> -            // append it
> -            (*ptrIter).second.push_back(ptr);
> +            // update the pointer source here,
> +            (*ptrIter).second[0] = ptr;
>            }
>          } else {
>            workList.push_back(theUser);
> @@ -727,28 +840,292 @@ namespace gbe
>        }
>      }
>    }
> +  bool GenWriter::isSingleBti(Value *Val) {
> +    // self + others same --> single
> +    // all same  ---> single
> +    if (!isa<SelectInst>(Val) && !isa<PHINode>(Val)) {
> +      return true;
> +    } else {
> +      PtrOrigMapIter iter = pointerOrigMap.find(Val);
> +      SmallVector<Value *, 4> &pointers = (*iter).second;
> +      unsigned srcNum = pointers.size();
> +      Value *source = NULL;
> +      for (unsigned x = 0; x < srcNum; x++) {
> +        // often happend in phiNode where one source is same as PHINode itself, skip it
> +        if (pointers[x] == Val) continue;
> +
> +        if (source == NULL) source = pointers[x];
> +        else {
> +          if (source != pointers[x])
> +            return false;
> +        }
> +      }
> +      return true;
> +    }
> +  }
> +  Value *GenWriter::getPointerBase(Value *ptr) {
> +    PtrBaseMapIter baseIter = pointerBaseMap.find(ptr);
> +    if (baseIter != pointerBaseMap.end()) {
> +      return baseIter->second;
> +    }
> +    typedef std::map<Value *, unsigned>::iterator BtiIter;
> +    // for pointers that already assigned a bti, it is the base pointer,
> +    BtiIter found = BtiMap.find(ptr);
> +    if (found != BtiMap.end()) {
> +      if (isa<PointerType>(ptr->getType())) {
> +        PointerType *ty = cast<PointerType>(ptr->getType());
> +        // only global pointer will have starting address
> +        if (ty->getAddressSpace() == 1) {
> +          return ptr;
> +        } else {
> +          return ConstantPointerNull::get(ty);
> +        }
> +      } else {
> +          PointerType *ty = PointerType::get(ptr->getType(), 0);
> +          return ConstantPointerNull::get(ty);
> +      }
> +    }
> +
> +    PtrOrigMapIter iter = pointerOrigMap.find(ptr);
> +    SmallVector<Value *, 4> &pointers = (*iter).second;
> +    if (isSingleBti(ptr)) {
> +      Value *base = getPointerBase(pointers[0]);
> +      pointerBaseMap.insert(std::make_pair(ptr, base));
> +      return base;
> +    } else {
> +      if (isa<SelectInst>(ptr)) {
> +          SelectInst *si = dyn_cast<SelectInst>(ptr);
> +          IRBuilder<> Builder(si->getParent());
> +
> +          Value *trueVal = getPointerBase((*iter).second[0]);
> +          Value *falseVal = getPointerBase((*iter).second[1]);
> +          Builder.SetInsertPoint(si);
> +          Value *base = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
> +          pointerBaseMap.insert(std::make_pair(ptr, base));
> +        return base;
          Incorrect indentation on the "return base;" line above.

> +      } else if (isa<PHINode>(ptr)) {
> +          PHINode *phi = dyn_cast<PHINode>(ptr);
> +          IRBuilder<> Builder(phi->getParent());
> +          Builder.SetInsertPoint(phi);
> +
> +          PHINode *basePhi = Builder.CreatePHI(ptr->getType(), phi->getNumIncomingValues());
> +          unsigned srcNum = pointers.size();
> +          for (unsigned x = 0; x < srcNum; x++) {
> +            Value *base = NULL;
> +            if (pointers[x] != ptr) {
> +              base = getPointerBase(pointers[x]);
> +            } else {
> +              base = basePhi;
> +            }
> +            basePhi->addIncoming(base, phi->getIncomingBlock(x));
> +          }
> +          pointerBaseMap.insert(std::make_pair(ptr, basePhi));
> +          return basePhi;
> +      } else {
> +        ptr->dump();
> +        GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
> +        return ptr;
> +      }
> +    }
> +  }
> +
> +  Value *GenWriter::getBtiRegister(Value *Val) {
> +    typedef std::map<Value *, unsigned>::iterator BtiIter;
> +    typedef std::map<Value *, Value *>::iterator BtiValueIter;
> +    BtiIter found = BtiMap.find(Val);
> +    BtiValueIter valueIter = BtiValueMap.find(Val);
> +    if (valueIter != BtiValueMap.end())
> +      return valueIter->second;
> +
> +    if (found != BtiMap.end()) {
> +      // the Val already got assigned an BTI, return it
> +      Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32), found->second);
> +      BtiValueMap.insert(std::make_pair(Val, bti));
> +      return bti;
> +    } else {
> +      if (isSingleBti(Val)) {
> +        PtrOrigMapIter iter = pointerOrigMap.find(Val);
> +        Value * bti = getBtiRegister((*iter).second[0]);
> +        BtiValueMap.insert(std::make_pair(Val, bti));
> +        return bti;
> +      } else {
> +        if (isa<SelectInst>(Val)) {
> +          SelectInst *si = dyn_cast<SelectInst>(Val);
> +
> +          IRBuilder<> Builder(si->getParent());
> +          PtrOrigMapIter iter = pointerOrigMap.find(Val);
> +          Value *trueVal = getBtiRegister((*iter).second[0]);
> +          Value *falseVal = getBtiRegister((*iter).second[1]);
> +          Builder.SetInsertPoint(si);
> +          Value *bti = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
> +          BtiValueMap.insert(std::make_pair(Val, bti));
> +          return bti;
> +        } else if (isa<PHINode>(Val)) {
> +          PHINode *phi = dyn_cast<PHINode>(Val);
> +          IRBuilder<> Builder(phi->getParent());
> +          Builder.SetInsertPoint(phi);
> +
> +          PHINode *btiPhi = Builder.CreatePHI(IntegerType::get(Val->getContext(), 32), phi->getNumIncomingValues());
> +          PtrOrigMapIter iter = pointerOrigMap.find(Val);
> +          SmallVector<Value *, 4> &pointers = (*iter).second;
> +          unsigned srcNum = pointers.size();
> +          for (unsigned x = 0; x < srcNum; x++) {
> +            Value *bti = NULL;
> +            if (pointers[x] != Val) {
> +              bti = getBtiRegister(pointers[x]);
> +            } else {
> +              bti = btiPhi;
> +            }
> +            btiPhi->addIncoming(bti, phi->getIncomingBlock(x));
> +          }
> +          BtiValueMap.insert(std::make_pair(Val, btiPhi));
> +          return btiPhi;
> +        } else {
> +          Val->dump();
> +          GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
> +          return Val;
> +        }
> +      }
> +    }
> +  }
> +
> +  unsigned GenWriter::getNewBti(Value *origin) {
> +    unsigned new_bti = 0;
> +    if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> +      new_bti = btiBase;
> +      incBtiBase();
> +    } else if (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
> +      new_bti = btiBase;
> +      incBtiBase();
> +    }
> +    else if (isa<GlobalVariable>(origin)
> +        && dyn_cast<GlobalVariable>(origin)->isConstant()) {
> +      new_bti = BTI_CONSTANT;
> +    } else {
> +      unsigned space = origin->getType()->getPointerAddressSpace();
> +      switch (space) {
> +        case 0:
> +          new_bti = BTI_PRIVATE;
> +          break;
> +        case 1:
> +        {
> +          new_bti = btiBase;
> +          incBtiBase();
> +          break;
> +        }
> +        case 2:
> +          new_bti = BTI_CONSTANT;
> +
> +          break;
> +        case 3:
> +          new_bti = BTI_LOCAL;
> +          break;
> +        default:
> +          GBE_ASSERT(0);
> +          break;
> +      }
> +    }
> +    return new_bti;
> +  }
> +  static bool isImageType(std::string typeName) {
> +    if (typeName.compare("image1d_t") == 0        ||
> +        typeName.compare("image1d_array_t") == 0  ||
> +        typeName.compare("image1d_buffer_t") == 0 ||
> +        typeName.compare("image2d_t") == 0        ||
> +        typeName.compare("image2d_array_t") == 0  ||
> +        typeName.compare("image2d_buffer_t") == 0 ||
> +        typeName.compare("image3d_t") == 0)
> +      return true;
> +    return false;
> +  }
The above function is already implemented in function.hpp.
Please consider reusing it.

> +
> +  void GenWriter::assignBti(Function &F) {
assignBti duplicates some code from emitFunctionPrototype().
It would be better to refactor it to reuse the existing code.


--
Thanks,
Zhigang Gong.

> +    Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
> +    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
> +      GlobalVariable &v = *i;
> +      if(!v.isConstantUsed()) continue;
> +
> +      BtiMap.insert(std::make_pair(&v, getNewBti(&v)));
> +    }
> +    NamedMDNode *clKernels = TheModule->getNamedMetadata("opencl.kernels");
> +    MDNode *typeNameNode = NULL;
> +     uint32_t ops = clKernels->getNumOperands();
> +      for(uint32_t x = 0; x < ops; x++) {
> +        MDNode* node = clKernels->getOperand(x);
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> +        Value * op = node->getOperand(0);
> +#else
> +        Value * op = cast<ValueAsMetadata>(node->getOperand(0))->getValue();
> +#endif
> +        if(op == &F) {
> +          for(uint j = 0; j < node->getNumOperands() - 1; j++) {
> +            MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
> +            if (attrNode == NULL) break;
> +            MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
> +            if (!attrName) continue;
> +            if (attrName->getString() == "kernel_arg_type") {
> +              typeNameNode = attrNode;
> +            }
> +          }
> +        }
> +      }
> +
> +    unsigned argID = 0;
> +    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
> +      std::string typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
> +      if (I->getType()->isPointerTy() || isImageType(typeName)) {
> +        BtiMap.insert(std::make_pair(I, getNewBti(I)));
> +      }
> +    }
> +
> +    BasicBlock &bb = F.getEntryBlock();
> +    for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
> +      if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
> +        BtiMap.insert(std::make_pair(ai, BTI_PRIVATE));
> +      }
> +    }
> +  }
>  
>    void GenWriter::analyzePointerOrigin(Function &F) {
> +    // used to record where the pointers get mixed (i.e. select or phi instruction)
> +    std::set<Value *> mixedPtr;
> +    // This is a two-pass algorithm, the 1st pass will try to update the pointer sources for
> +    // every instruction reachable from pointers and record mix-point in this pass.
> +    // The second pass will start from really mixed-pointer instruction like select or phinode.
> +    // and update the sources correctly. For pointers reachable from mixed-pointer, we will set
> +    // its direct mixed-pointer parent as it's pointer origin.
> +
>      // GlobalVariable
>      Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
>      for(auto i = globalList.begin(); i != globalList.end(); i ++) {
>        GlobalVariable &v = *i;
>        if(!v.isConstantUsed()) continue;
> -      findPointerEscape(&v);
> +      findPointerEscape(&v, mixedPtr, true);
>      }
>      // function argument
>      for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
>        if (I->getType()->isPointerTy()) {
> -        findPointerEscape(I);
> +        findPointerEscape(I, mixedPtr, true);
>        }
>      }
>      // alloca
>      BasicBlock &bb = F.getEntryBlock();
>      for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
>        if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
> -        findPointerEscape(ai);
> +        findPointerEscape(ai, mixedPtr, true);
>        }
>      }
> +    // the second pass starts from mixed pointer
> +    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
> +      findPointerEscape(*iter, mixedPtr, false);
> +    }
> +
> +    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
> +      getBtiRegister(*iter);
> +    }
> +    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
> +      getPointerBase(*iter);
> +    }
>    }
>  
>    void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
> @@ -1419,7 +1796,7 @@ namespace gbe
>          const ir::Register reg = getRegister(I);
>          if (llvmInfo.isImageType()) {
>            ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
> -          ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
> +          ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
>            collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
>            continue;
>          }
> @@ -1452,10 +1829,7 @@ namespace gbe
>              const uint32_t align = getAlignmentByte(unit, pointed);
>                switch (addrSpace) {
>                case ir::MEM_GLOBAL:
> -                globalPointer.insert(std::make_pair(I, btiBase));
> -                ctx.appendSurface(btiBase, reg);
> -                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
> -                incBtiBase();
> +                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
>                break;
>                case ir::MEM_LOCAL:
>                  ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, BTI_LOCAL);
> @@ -1806,14 +2180,10 @@ namespace gbe
>          ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
>        } else {
>          if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> -          ctx.appendSurface(btiBase, ir::ocl::printfbptr);
> -          ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
> -          globalPointer.insert(std::make_pair(&v, incBtiBase()));
> +          ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
>            regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
>          } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
> -          ctx.appendSurface(btiBase, ir::ocl::printfiptr);
> -          ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
> -          globalPointer.insert(std::make_pair(&v, incBtiBase()));
> +          ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
>            regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
>          } else if(v.getName().str().substr(0, 4) == ".str") {
>            /* When there are multi printf statements in multi kernel fucntions within the same
> @@ -2045,6 +2415,7 @@ namespace gbe
>      }
>  
>      ctx.startFunction(F.getName());
> +
>      ir::Function &fn = ctx.getFunction();
>      this->regTranslator.clear();
>      this->labelMap.clear();
> @@ -2837,19 +3208,46 @@ namespace gbe
>      CallSite::arg_iterator AE = CS.arg_end();
>      GBE_ASSERT(AI != AE);
>  
> +    ir::AddressSpace addrSpace;
> +
> +    Value *llvmPtr = *AI;
> +    Value *bti = getBtiRegister(llvmPtr);
> +    Value *ptrBase = getPointerBase(llvmPtr);
> +    ir::Register pointer = this->getRegister(llvmPtr);
> +    ir::Register baseReg = this->getRegister(ptrBase);
> +
> +    ir::Register btiReg;
> +    bool fixedBTI = false;
> +    if (isa<ConstantInt>(bti)) {
> +      fixedBTI = true;
> +      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
> +      addrSpace = btiToGen(index);
> +      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
> +      btiReg = ctx.reg(ir::FAMILY_DWORD);
> +      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
> +    } else {
> +      addrSpace = ir::MEM_MIXED;
> +      btiReg = this->getRegister(bti);
> +    }
> +
> +    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +    const ir::Register ptr = ctx.reg(pointerFamily);
> +    ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> +
>      const ir::Register dst = this->getRegister(&I);
>  
> -    ir::BTI bti;
> -    gatherBTI(&I, bti);
> -    const ir::AddressSpace addrSpace = btiToGen(bti);
> -    vector<ir::Register> src;
>      uint32_t srcNum = 0;
> +    vector<ir::Register> src;
> +    src.push_back(ptr);
> +    srcNum++;
> +    AI++;
> +
>      while(AI != AE) {
>        src.push_back(this->getRegister(*(AI++)));
>        srcNum++;
>      }
>      const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
> -    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
> +    ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
>    }
>  
>    /* append a new sampler. should be called before any reference to
> @@ -3546,8 +3944,8 @@ namespace gbe
>    void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                                        Value *llvmValues, const ir::Register ptr,
>                                        const ir::AddressSpace addrSpace,
> -                                      Type * elemType, bool isLoad, ir::BTI bti,
> -                                      bool dwAligned) {
> +                                      Type * elemType, bool isLoad, ir::Register bti,
> +                                      bool dwAligned, bool fixedBTI) {
>      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>      uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
>      uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> @@ -3593,79 +3991,18 @@ namespace gbe
>  
>        // Emit the instruction
>        if (isLoad)
> -        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
> +        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
>        else
> -        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
> -    }
> -  }
> -
> -  // The idea behind is to search along the use-def chain, and find out all
> -  // possible sources of the pointer. Then in later codeGen, we can emit
> -  // read/store instructions to these BTIs gathered.
> -  void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
> -    PtrOrigMapIter iter = pointerOrigMap.find(insn);
> -    if (iter != pointerOrigMap.end()) {
> -      SmallVectorImpl<Value *> &origins = iter->second;
> -      uint8_t nBTI = 0;
> -      for (unsigned i = 0; i < origins.size(); i++) {
> -        uint8_t new_bti = 0;
> -        Value *origin = origins[i];
> -        // all constant put into constant cache, including __constant & const __private
> -        if (isa<GlobalVariable>(origin)
> -            && dyn_cast<GlobalVariable>(origin)->isConstant()) {
> -          new_bti = BTI_CONSTANT;
> -        } else {
> -          unsigned space = origin->getType()->getPointerAddressSpace();
> -          switch (space) {
> -            case 0:
> -              new_bti = BTI_PRIVATE;
> -              break;
> -            case 1:
> -            {
> -              GlobalPtrIter iter = globalPointer.find(origin);
> -              GBE_ASSERT(iter != globalPointer.end());
> -              new_bti = iter->second;
> -              break;
> -            }
> -            case 2:
> -              new_bti = BTI_CONSTANT;
> -              break;
> -            case 3:
> -              new_bti = BTI_LOCAL;
> -              break;
> -            default:
> -              GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
> -              break;
> -          }
> -        }
> -
> -        // avoid duplicate
> -        bool bFound = false;
> -        for (int j = 0; j < nBTI; j++) {
> -          if (bti.bti[j] == new_bti) {
> -            bFound = true; break;
> -          }
> -        }
> -        if (bFound == false) {
> -          bti.bti[nBTI++] = new_bti;
> -          bti.count = nBTI;
> -        }
> -      }
> -    } else {
> -      insn->dump();
> -      std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
> -      std::cerr << "Aborting..." << std::endl;
> -      exit(-1);
> +        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
>      }
> -    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
>    }
> +
>    // handle load of dword/qword with unaligned address
> -  void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
> +  void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
>    {
>      Type *llvmType = llvmValues->getType();
>      const ir::Type type = getType(ctx, llvmType);
>      unsigned byteSize = getTypeByteSize(unit, llvmType);
> -    const ir::Register ptr = this->getRegister(llvmPtr);
>  
>      Type *elemType = llvmType;
>      unsigned elemNum = 1;
> @@ -3695,13 +4032,13 @@ namespace gbe
>      const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
>  
>      if (isLoad) {
> -      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
> +      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
>        ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
>      } else {
>        ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
>        // FIXME: byte scatter does not handle correctly vector store, after fix that,
>        //        we can directly use on store instruction like:
> -      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
> +      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
>        const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>        for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
>          const ir::Register reg = byteTupleData[elemID];
> @@ -3716,7 +4053,7 @@ namespace gbe
>            ctx.LOADI(ir::TYPE_S32, offset, immIndex);
>            ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
>          }
> -       ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
> +       ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
>        }
>      }
>    }
> @@ -3729,10 +4066,31 @@ namespace gbe
>      Value *llvmValues = getLoadOrStoreValue(I);
>      Type *llvmType = llvmValues->getType();
>      const bool dwAligned = (I.getAlignment() % 4) == 0;
> -    const ir::Register ptr = this->getRegister(llvmPtr);
> -    ir::BTI binding;
> -    gatherBTI(&I, binding);
> -    const ir::AddressSpace addrSpace = btiToGen(binding);
> +    ir::AddressSpace addrSpace;
> +    const ir::Register pointer = this->getRegister(llvmPtr);
> +    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +
> +    Value *bti = getBtiRegister(llvmPtr);
> +    Value *ptrBase = getPointerBase(llvmPtr);
> +    ir::Register baseReg = this->getRegister(ptrBase);
> +    bool zeroBase = false;
> +    if (isa<ConstantPointerNull>(ptrBase)) {
> +      zeroBase = true;
> +    }
> +
> +    ir::Register btiReg;
> +    bool fixedBTI = false;
> +    if (isa<ConstantInt>(bti)) {
> +      fixedBTI = true;
> +      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
> +      addrSpace = btiToGen(index);
> +      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
> +      btiReg = ctx.reg(ir::FAMILY_DWORD);
> +      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
> +    } else {
> +      addrSpace = ir::MEM_MIXED;
> +      btiReg = this->getRegister(bti);
> +    }
>  
>      Type *scalarType = llvmType;
>      if (!isScalarType(llvmType)) {
> @@ -3740,11 +4098,20 @@ namespace gbe
>        scalarType = vectorType->getElementType();
>      }
>  
> +    ir::Register ptr = ctx.reg(pointerFamily);
> +    // FIXME: avoid subtraction zero at this stage is not a good idea,
> +    // but later ArgumentLower pass need to match exact load/addImm pattern
> +    // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
> +    if (!zeroBase)
> +      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> +    else
> +      ptr = pointer;
> +
>      if (!dwAligned
>         && (scalarType == IntegerType::get(I.getContext(), 64)
>            || scalarType == IntegerType::get(I.getContext(), 32))
>         ) {
> -      emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
> +      emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
>        return;
>      }
>      // Scalar is easy. We neednot build register tuples
> @@ -3752,9 +4119,9 @@ namespace gbe
>        const ir::Type type = getType(ctx, llvmType);
>        const ir::Register values = this->getRegister(llvmValues);
>        if (isLoad)
> -        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
> +        ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
>        else
> -        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
> +        ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
>      }
>      // A vector type requires to build a tuple
>      else {
> @@ -3776,10 +4143,9 @@ namespace gbe
>        // The code is going to be fairly different from types to types (based on
>        // size of each vector element)
>        const ir::Type type = getType(ctx, elemType);
> -      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>        const ir::RegisterFamily dataFamily = getFamily(type);
>  
> -      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
> +      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
>          // One message is enough here. Nothing special to do
>          if (elemNum <= 4) {
>            // Build the tuple data in the vector
> @@ -3798,19 +4164,19 @@ namespace gbe
>  
>            // Emit the instruction
>            if (isLoad)
> -            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
> +            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
>            else
> -            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
> +            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
>          }
>          // Not supported by the hardware. So, we split the message and we use
>          // strided loads and stores
>          else {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
>          }
>        }
>        else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
>                (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
>        } else {
>          for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
>            if(regTranslator.isUndefConst(llvmValues, elemID))
> @@ -3830,9 +4196,9 @@ namespace gbe
>                ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
>            }
>            if (isLoad)
> -           ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
> +           ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
>            else
> -           ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
> +           ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
>          }
>        }
>      }
> -- 
> 2.3.6
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list