[Beignet] [PATCH V3] GBE: Refine ir for memory operation like atomic/load/store

Yang, Rong R rong.r.yang at intel.com
Thu Nov 5 23:01:27 PST 2015


Pushed.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Ruiling Song
> Sent: Wednesday, November 4, 2015 15:50
> To: beignet at lists.freedesktop.org
> Cc: Song, Ruiling
> Subject: [Beignet] [PATCH V3] GBE: Refine ir for memory operation like
> atomic/load/store
> 
> legacyMode determines which kind of address mode is chosen.
> When legacyMode is true, we need to do the complex bti analysis.
> 
> dynamicBti and staticBti are mostly for platforms before BDW,
> and stateless is for BDW and later platforms (BDW+).
> 
> v2:
>  only do analyzePointerOrigin() under legacyMode.
> 
> v3:
>  fix conflicts with master, and fix some reorder warnings.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 132 ++++------
>  backend/src/ir/context.hpp                 |  19 --
>  backend/src/ir/instruction.cpp             | 410 +++++++++++++++++------------
>  backend/src/ir/instruction.hpp             |  78 +++---
>  backend/src/ir/lowering.cpp                |   4 +-
>  backend/src/llvm/llvm_gen_backend.cpp      | 393 +++++++++++++-----------
> ---
>  6 files changed, 531 insertions(+), 505 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 2452aea..5ec420e 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -1254,11 +1254,11 @@ namespace gbe
>    }
> 
>    void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
> -                                 uint32_t srcNum, Reg src0,
> +                                 uint32_t msgPayload, Reg src0,
>                                   Reg src1, Reg src2, GenRegister bti,
>                                   vector<GenRegister> temps) {
>      unsigned dstNum = 1 + temps.size();
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum,
> srcNum + 1);
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum,
> msgPayload + 1);
> 
>      if (bti.file != GEN_IMMEDIATE_VALUE) {
>        insn->state.flag = 0;
> @@ -1272,14 +1272,15 @@ namespace gbe
>      }
> 
>      insn->src(0) = src0;
> -    if(srcNum > 1) insn->src(1) = src1;
> -    if(srcNum > 2) insn->src(2) = src2;
> -    insn->src(srcNum) = bti;
> +    if(msgPayload > 1) insn->src(1) = src1;
> +    if(msgPayload > 2) insn->src(2) = src2;
> +    insn->src(msgPayload) = bti;
> +
>      insn->extra.function = function;
> -    insn->extra.elem = srcNum;
> +    insn->extra.elem = msgPayload;
> 
>      SelectionVector *vector = this->appendVector();
> -    vector->regNum = srcNum;
> +    vector->regNum = msgPayload; //bti not included in SelectionVector
>      vector->offsetID = 0;
>      vector->reg = &insn->src(0);
>      vector->isSrc = 1;
> @@ -3424,8 +3425,6 @@ namespace gbe
>                     uint32_t valueNum,
>                     ir::BTI bti) const
>      {
> -        //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base,
> ir::TYPE_U32));
> -
>          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :
> sel.selReg(bti.reg, ir::TYPE_U32);
>          sel.UNTYPED_READ(addr, dst.data(), valueNum, b,
> sel.getBTITemps(bti));
>      }
> @@ -3726,28 +3725,12 @@ namespace gbe
>        return false;
>      }
> 
> -    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn)
> const {
> -      using namespace ir;
> -      SelectionDAG *child0 = dag.child[0];
> -      ir::BTI b;
> -      if (insn.isFixedBTI()) {
> -        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> -        const auto imm = immInsn.getImmediate();
> -        b.isConst = 1;
> -        b.imm = imm.getIntegerValue();
> -      } else {
> -        b.isConst = 0;
> -        b.reg = insn.getBTI();
> -      }
> -      return b;
> -    }
> -
>      /*! Implements base class */
>      virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
>      {
>        using namespace ir;
>        const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
> -      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
> +      GenRegister address = sel.selReg(insn.getAddressRegister(),
> ir::TYPE_U32);
>        GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
>                   insn.getAddressSpace() == MEM_CONSTANT ||
>                   insn.getAddressSpace() == MEM_PRIVATE ||
> @@ -3755,8 +3738,17 @@ namespace gbe
>                   insn.getAddressSpace() == MEM_MIXED);
>        //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
> 
> -      BTI bti = getBTI(dag, insn);
> -
> +      BTI bti;
> +      AddressMode am = insn.getAddressMode();
> +      if (am == AM_StaticBti) {
> +        bti.isConst = 1;
> +        bti.imm = insn.getSurfaceIndex();
> +      } else if (am == AM_DynamicBti) {
> +        bti.isConst = 0;
> +        bti.reg = insn.getBtiReg();
> +      } else {
> +        assert(0 && "stateless not supported yet");
> +      }
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(sel, type);
>        bool allConstant = isAllConstant(bti);
> @@ -3784,12 +3776,7 @@ namespace gbe
>            this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        }
> 
> -
> -      // for fixed bti, don't generate the useless loadi
> -      if (insn.isFixedBTI())
> -        dag.child[0] = NULL;
>        markAllChildren(dag);
> -
>        return true;
>      }
>    };
> @@ -3893,32 +3880,26 @@ namespace gbe
>        }
>      }
> 
> -
> -    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn)
> const {
> -      using namespace ir;
> -      SelectionDAG *child0 = dag.child[0];
> -      ir::BTI b;
> -      if (insn.isFixedBTI()) {
> -        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> -        const auto imm = immInsn.getImmediate();
> -        b.isConst = 1;
> -        b.imm = imm.getIntegerValue();
> -      } else {
> -        b.isConst = 0;
> -        b.reg = insn.getBTI();
> -      }
> -      return b;
> -    }
>      virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
>      {
>        using namespace ir;
>        const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
> -      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
> +      GenRegister address = sel.selReg(insn.getAddressRegister(),
> ir::TYPE_U32);
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(sel, type);
> 
> -      const bool isUniform = sel.isScalarReg(insn.getAddress()) &&
> sel.isScalarReg(insn.getValue(0));
> -      BTI bti = getBTI(dag, insn);
> +      const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) &&
> sel.isScalarReg(insn.getValue(0));
> +      BTI bti;
> +      AddressMode am = insn.getAddressMode();
> +      if (am == AM_StaticBti) {
> +        bti.isConst = 1;
> +        bti.imm = insn.getSurfaceIndex();
> +      } else if (am == AM_DynamicBti) {
> +        bti.isConst = 0;
> +        bti.reg = insn.getBtiReg();
> +      } else {
> +        assert(0 && "stateless not supported yet");
> +      }
> 
>        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
>          this->emitWrite64(sel, insn, address, bti);
> @@ -3928,11 +3909,7 @@ namespace gbe
>          this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
>        }
> 
> -      // for fixed bti, don't generate the useless loadi
> -      if (insn.isFixedBTI())
> -        dag.child[0] = NULL;
>        markAllChildren(dag);
> -
>        return true;
>      }
>    };
> @@ -4795,47 +4772,36 @@ namespace gbe
>            this->opcodes.push_back(ir::Opcode(op));
>      }
> 
> -    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn)
> const {
> -      using namespace ir;
> -      SelectionDAG *child0 = dag.child[0];
> -      ir::BTI b;
> -      if (insn.isFixedBTI()) {
> -        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
> -        const auto imm = immInsn.getImmediate();
> -        b.isConst = 1;
> -        b.imm = imm.getIntegerValue();
> -      } else {
> -        b.isConst = 0;
> -        b.reg = insn.getBTI();
> -      }
> -      return b;
> -    }
> -
>      INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
>        using namespace ir;
>        const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
> 
> -      ir::BTI b = getBTI(dag, insn);
> +      ir::BTI b;
>        const AtomicOps atomicOp = insn.getAtomicOpcode();
>        unsigned srcNum = insn.getSrcNum();
> -      unsigned opNum = srcNum - 1;
> +      unsigned msgPayload;
> +
> +      AddressMode AM = insn.getAddressMode();
> +      if (AM == AM_DynamicBti) {
> +        b.reg = insn.getBtiReg();
> +        msgPayload = srcNum - 1;
> +      } else {
> +        b.imm = insn.getSurfaceIndex();
> +        b.isConst = 1;
> +        msgPayload = srcNum;
> +      }
> 
>        GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
>        GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) :
> sel.selReg(b.reg, ir::TYPE_U32);
> -      GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32);   //address
> +      GenRegister src0 = sel.selReg(insn.getAddressRegister(), TYPE_U32);
>        GenRegister src1 = src0, src2 = src0;
> -      if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
> -      if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
> +      if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
> +      if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
> 
>        GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
> +      sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti,
> sel.getBTITemps(b));
> 
> -      sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti,
> sel.getBTITemps(b));
> -
> -      // for fixed bti, don't generate the useless loadi
> -      if (insn.isFixedBTI())
> -        dag.child[0] = NULL;
>        markAllChildren(dag);
> -
>        return true;
>      }
>    };
> diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
> index 0f7ded4..ab0d8b5 100644
> --- a/backend/src/ir/context.hpp
> +++ b/backend/src/ir/context.hpp
> @@ -189,25 +189,6 @@ namespace ir {
>        this->append(insn);
>      }
> 
> -    /*! LOAD with the destinations directly specified */
> -    template <typename... Args>
> -    void LOAD(Type type, Register offset, AddressSpace space, bool
> dwAligned, bool fixedBTI, Register bti, Args...values)
> -    {
> -      const Tuple index = this->tuple(values...);
> -      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
> -      GBE_ASSERT(valueNum > 0);
> -      this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI,
> bti);
> -    }
> -
> -    /*! STORE with the sources directly specified */
> -    template <typename... Args>
> -    void STORE(Type type, Register offset, AddressSpace space, bool
> dwAligned, bool fixedBTI, Register bti, Args...values)
> -    {
> -      const Tuple index = this->tuple(values...);
> -      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
> -      GBE_ASSERT(valueNum > 0);
> -      this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI,
> bti);
> -    }
>      void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti,
> reg); }
> 
>    protected:
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index f93c528..0246920 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -316,60 +316,104 @@ namespace ir {
>        Type srcType; //!< Type to convert from
>      };
> 
> +    class ALIGNED_INSTRUCTION MemInstruction :
> +      public BasePolicy
> +    {
> +    public:
> +      MemInstruction(AddressMode   _AM,
> +                     AddressSpace _AS,
> +                     bool _dwAligned,
> +                     Type _type,
> +                     Register _offset)
> +                   : AM(_AM),
> +                     AS(_AS),
> +                     dwAligned(_dwAligned),
> +                     type(_type),
> +                     SurfaceIndex(0),
> +                     offset(_offset) {
> +      }
> +      AddressMode  getAddressMode()    const { return AM; }
> +      AddressSpace getAddressSpace()   const { return AS; }
> +      /*! MemInstruction may have one possible btiReg */
> +      Register     getBtiReg()         const { assert(AM == AM_DynamicBti); return
> BtiReg; }
> +      unsigned     getSurfaceIndex()   const { assert(AM != AM_DynamicBti);
> return SurfaceIndex; }
> +      Register     getAddressRegister()const { return offset; }
> +      unsigned     getAddressIndex()   const { return 0; }
> +      Type         getValueType()      const { return type; }
> +      INLINE bool  isAligned(void)     const { return !!dwAligned; }
> +
> +      void         setSurfaceIndex (unsigned id)  { SurfaceIndex = id; }
> +      void         setBtiReg(Register reg)        { BtiReg = reg;      }
> +    protected:
> +      /*! including address reg + optional bti reg */
> +      int          getBaseSrcNum()    const { return AM == AM_DynamicBti ? 2 : 1; }
> +      bool         hasExtraBtiReg()   const { return AM == AM_DynamicBti; }
> +      AddressMode       AM;
> +      AddressSpace      AS;
> +      uint8_t           dwAligned : 1;
> +      Type              type;
> +      union {
> +        Register        BtiReg;
> +        unsigned        SurfaceIndex;
> +      };
> +      Register          offset;
> +    };
> +
>      class ALIGNED_INSTRUCTION AtomicInstruction :
> -      public BasePolicy,
> +      public MemInstruction,
>        public NDstPolicy<AtomicInstruction, 1>
>      {
>      public:
>        AtomicInstruction(AtomicOps atomicOp,
> +                         Type type,
>                           Register dst,
>                           AddressSpace addrSpace,
> -                         Register bti,
> -                         bool fixedBTI,
> -                         Tuple src)
> +                         Register address,
> +                         Tuple payload,
> +                         AddressMode AM)
> +        : MemInstruction(AM, addrSpace, true, type, address)
>        {
>          this->opcode = OP_ATOMIC;
>          this->atomicOp = atomicOp;
>          this->dst[0] = dst;
> -        this->src = src;
> -        this->addrSpace = addrSpace;
> -        this->bti = bti;
> -        this->fixedBTI = fixedBTI ? 1: 0;
> -        srcNum = 2;
> +        this->payload = payload;
> +
> +        int payloadNum = 1;
>          if((atomicOp == ATOMIC_OP_INC) ||
>            (atomicOp == ATOMIC_OP_DEC))
> -          srcNum = 1;
> +          payloadNum = 0;
>          if(atomicOp == ATOMIC_OP_CMPXCHG)
> -          srcNum = 3;
> -        srcNum++;
> +          payloadNum = 2;
> +
> +        srcNum = payloadNum + getBaseSrcNum();
>        }
>        INLINE Register getSrc(const Function &fn, uint32_t ID) const {
> -        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
> -        if (ID == 0u)
> -          return bti;
> -        else
> -          return fn.getRegister(src, ID -1);
> +        GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for
> atomic");
> +        if (ID == 0) {
> +          return offset;
> +        } else if (hasExtraBtiReg() && (int)ID == (int)srcNum-1) {
> +          return getBtiReg();
> +        } else {
> +          return fn.getRegister(payload, ID - 1);
> +        }
>        }
>        INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
> -        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
> -        if (ID == 0u)
> -          bti = reg;
> -        else
> -          fn.setRegister(src, ID - 1, reg);
> +        GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for
> atomic");
> +        if (ID == 0) {
> +          offset = reg;
> +        } else if (hasExtraBtiReg() && (int)ID == (int)srcNum - 1) {
> +          setBtiReg(reg);
> +        } else {
> +          fn.setRegister(payload, ID - 1, reg);
> +        }
>        }
>        INLINE uint32_t getSrcNum(void) const { return srcNum; }
> 
> -      INLINE AddressSpace getAddressSpace(void) const { return this-
> >addrSpace; }
> -      INLINE Register getBTI(void) const { return bti; }
> -      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
>        INLINE AtomicOps getAtomicOpcode(void) const { return this-
> >atomicOp; }
>        INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
>        INLINE void out(std::ostream &out, const Function &fn) const;
>        Register dst[1];
> -      Tuple src;
> -      AddressSpace addrSpace; //!< Address space
> -      Register bti;               //!< bti
> -      uint8_t fixedBTI:1;      //!< fixed bti or not
> +      Tuple payload;
>        uint8_t srcNum:3;     //!<Source Number
>        AtomicOps atomicOp:6;     //!<Source Number
>      };
> @@ -428,119 +472,102 @@ namespace ir {
>        Register dst[0];       //!< No destination
>      };
> 
> +
>      class ALIGNED_INSTRUCTION LoadInstruction :
> -      public BasePolicy,
> -      public NSrcPolicy<LoadInstruction, 2>
> +      public MemInstruction
>      {
> -    public:
> -      LoadInstruction(Type type,
> -                      Tuple dstValues,
> -                      Register offset,
> -                      AddressSpace addrSpace,
> -                      uint32_t valueNum,
> -                      bool dwAligned,
> -                      bool fixedBTI,
> -                      Register bti)
> -      {
> -        GBE_ASSERT(valueNum < 128);
> -        this->opcode = OP_LOAD;
> -        this->type = type;
> -        this->offset = offset;
> -        this->values = dstValues;
> -        this->addrSpace = addrSpace;
> -        this->valueNum = valueNum;
> -        this->dwAligned = dwAligned ? 1 : 0;
> -        this->fixedBTI = fixedBTI ? 1 : 0;
> -        this->bti = bti;
> -      }
> -      INLINE Register getDst(const Function &fn, uint32_t ID) const {
> -        GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
> -        return fn.getRegister(values, ID);
> -      }
> -      INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
> -        GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
> -        fn.setRegister(values, ID, reg);
> -      }
> -      INLINE uint32_t getDstNum(void) const { return valueNum; }
> -      INLINE Type getValueType(void) const { return type; }
> -      INLINE uint32_t getValueNum(void) const { return valueNum; }
> -      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
> -      INLINE Register getBTI(void) const { return bti; }
> -      INLINE bool wellFormed(const Function &fn, std::string &why) const;
> -      INLINE void out(std::ostream &out, const Function &fn) const;
> -      INLINE bool isAligned(void) const { return !!dwAligned; }
> -      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
> -      Type type;              //!< Type to store
> -      Register src[0];        //!< Address where to load from
> -      Register bti;
> -      Register offset;        //!< Alias to make it similar to store
> -      Tuple values;           //!< Values to load
> -      AddressSpace addrSpace; //!< Where to load
> -      uint8_t fixedBTI:1;
> -      uint8_t valueNum:7;     //!< Number of values to load
> -      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
> -    };
> +      public:
> +        LoadInstruction(Type type,
> +                        Tuple dstValues,
> +                        Register offset,
> +                        AddressSpace AS,
> +                        uint32_t _valueNum,
> +                        bool dwAligned,
> +                        AddressMode AM)
> +                      : MemInstruction(AM, AS, dwAligned, type, offset),
> +                        valueNum(_valueNum),
> +                        values(dstValues)
> +        {
> +          this->opcode = OP_LOAD;
> +        }
> 
> +        INLINE unsigned getSrcNum() const { return getBaseSrcNum(); }
> +        INLINE Register getSrc(const Function &fn, unsigned id) const {
> +          if (id == 0) return offset;
> +          if (hasExtraBtiReg() && id == 1) return BtiReg;
> +          assert(0 && "LoadInstruction::getSrc() out-of-range");
> +          return ir::Register(0);
> +        }
> +        INLINE void     setSrc(Function &fn, unsigned id, Register reg) {
> +          assert(id < getSrcNum());
> +          if (id == 0) { offset = reg;   return; }
> +          if (id == 1) { setBtiReg(reg); return; }
> +        }
> +        INLINE unsigned getDstNum() const { return valueNum; }
> +        INLINE Register getDst(const Function &fn, unsigned id) const {
> +          assert(id < valueNum);
> +          return fn.getRegister(values, id);
> +        }
> +        INLINE void     setDst(Function &fn, unsigned id, Register reg) {
> +          assert(id < getDstNum());
> +          fn.setRegister(values, id, reg);
> +        }
> +        INLINE uint32_t getValueNum(void) const { return valueNum; }
> +        INLINE Register getValue(const Function &fn, unsigned id) const {
> +          assert(id < valueNum);
> +          return fn.getRegister(values, id);
> +        }
> +        INLINE bool wellFormed(const Function &fn, std::string &why) const;
> +        INLINE void out(std::ostream &out, const Function &fn) const;
> +
> +        uint8_t         valueNum;
> +        Tuple             values;
> +    };
>      class ALIGNED_INSTRUCTION StoreInstruction :
> -      public BasePolicy, public NDstPolicy<StoreInstruction, 0>
> +      public MemInstruction,
> +      public NDstPolicy<StoreInstruction, 0>
>      {
> -    public:
> -      StoreInstruction(Type type,
> -                       Tuple values,
> -                       Register offset,
> -                       AddressSpace addrSpace,
> -                       uint32_t valueNum,
> -                       bool dwAligned,
> -                       bool fixedBTI,
> -                       Register bti)
> -      {
> -        GBE_ASSERT(valueNum < 255);
> -        this->opcode = OP_STORE;
> -        this->type = type;
> -        this->offset = offset;
> -        this->values = values;
> -        this->addrSpace = addrSpace;
> -        this->valueNum = valueNum;
> -        this->dwAligned = dwAligned ? 1 : 0;
> -        this->fixedBTI = fixedBTI ? 1 : 0;
> -        this->bti = bti;
> -      }
> -      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
> -        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for
> store");
> -        if (ID == 0u)
> -          return bti;
> -        else if (ID == 1u)
> -          return offset;
> -        else
> -          return fn.getRegister(values, ID - 2);
> -      }
> -      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
> -        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for
> store");
> -        if (ID == 0u)
> -          bti = reg;
> -        else if (ID == 1u)
> -          offset = reg;
> -        else
> -          fn.setRegister(values, ID - 2, reg);
> -      }
> -      INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
> -      INLINE uint32_t getValueNum(void) const { return valueNum; }
> -      INLINE Type getValueType(void) const { return type; }
> -      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
> -      INLINE Register getBTI(void) const { return bti; }
> -      INLINE bool wellFormed(const Function &fn, std::string &why) const;
> -      INLINE void out(std::ostream &out, const Function &fn) const;
> -      INLINE bool isAligned(void) const { return !!dwAligned; }
> -      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
> -      Type type;              //!< Type to store
> -      Register bti;
> -      Register offset;        //!< First source is the offset where to store
> -      Tuple values;           //!< Values to store
> -      AddressSpace addrSpace; //!< Where to store
> -      uint8_t fixedBTI:1;                //!< Which btis need access
> -      uint8_t valueNum:7;     //!< Number of values to store
> -      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
> -      Register dst[0];        //!< No destination
> +      public:
> +        StoreInstruction(Type type,
> +                         Tuple values,
> +                         Register offset,
> +                         AddressSpace addrSpace,
> +                         uint32_t valueNum,
> +                         bool dwAligned,
> +                         AddressMode AM)
> +          : MemInstruction(AM, addrSpace, dwAligned, type, offset)
> +        {
> +          this->opcode = OP_STORE;
> +          this->values = values;
> +          this->valueNum = valueNum;
> +        }
> +        INLINE unsigned getValueNum()      const { return valueNum; }
> +        INLINE Register getValue(const Function &fn, unsigned id) const {
> +          return fn.getRegister(values, id);
> +        }
> +        INLINE unsigned getSrcNum()        const { return getBaseSrcNum() +
> valueNum; }
> +        INLINE Register getSrc(const Function &fn, unsigned id) const {
> +          if (id == 0)  return offset;
> +          if (id <= valueNum) return fn.getRegister(values, id-1);
> +          if (hasExtraBtiReg() && (int)id == (int)valueNum+1) return getBtiReg();
> +          assert(0 && "StoreInstruction::getSrc() out-of-range");
> +          return Register(0);
> +        }
> +        INLINE void     setSrc(Function &fn, unsigned id, Register reg) {
> +          if (id == 0)                   { offset = reg; return; }
> +          if (id > 0 && id <= valueNum)  { fn.setRegister(values, id-1, reg);
> return; }
> +          if (hasExtraBtiReg() &&
> +              (int)id == (int)valueNum + 1)        {
> +            setBtiReg(reg);
> +            return;
> +          }
> +          assert(0 && "StoreInstruction::setSrc() index out-of-range");
> +        }
> +        INLINE bool wellFormed(const Function &fn, std::string &why) const;
> +        INLINE void out(std::ostream &out, const Function &fn) const;
> +        Register      dst[0];
> +        uint8_t     valueNum;
> +        Tuple         values;
>      };
> 
>      class ALIGNED_INSTRUCTION SampleInstruction : // TODO
> @@ -1037,8 +1064,6 @@ namespace ir {
>          if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u),
> fn, whyNot) == false))
>            return false;
> 
> -      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) ==
> false))
> -        return false;
>        return true;
>      }
> 
> @@ -1065,7 +1090,7 @@ namespace ir {
>      template <typename T>
>      INLINE bool wellFormedLoadStore(const T &insn, const Function &fn,
> std::string &whyNot)
>      {
> -      if (UNLIKELY(insn.offset >= fn.regNum())) {
> +      if (UNLIKELY(insn.getAddressRegister() >= fn.regNum())) {
>          whyNot = "Out-of-bound offset register index";
>          return false;
>        }
> @@ -1073,10 +1098,11 @@ namespace ir {
>          whyNot = "Out-of-bound tuple index";
>          return false;
>        }
> +
>        // Check all registers
> -      const RegisterFamily family = getFamily(insn.type);
> -      for (uint32_t valueID = 0; valueID < insn.valueNum; ++valueID) {
> -        const Register regID = fn.getRegister(insn.values, valueID);
> +      const RegisterFamily family = getFamily(insn.getValueType());
> +      for (uint32_t valueID = 0; valueID < insn.getValueNum(); ++valueID) {
> +        const Register regID = insn.getValue(fn, valueID);;
>          if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
>            return false;
>        }
> @@ -1260,12 +1286,18 @@ namespace ir {
> 
>      INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn)
> const {
>        this->outOpcode(out);
> -      out << "." << addrSpace;
> +      out << "." << AS;
>        out << " %" << this->getDst(fn, 0);
> -      out << " {" << "%" << this->getSrc(fn, 1) << "}";
> -      for (uint32_t i = 2; i < srcNum; ++i)
> +      out << " {" << "%" << this->getSrc(fn, 0) << "}";
> +      for (uint32_t i = 1; i < srcNum; ++i)
>          out << " %" << this->getSrc(fn, i);
> -      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
> +      AddressMode am = this->getAddressMode();
> +      out << " bti:";
> +      if ( am == AM_DynamicBti) {
> +        out << " %" << this->getBtiReg();
> +      } else {
> +        out << this->getSurfaceIndex();
> +      }
>      }
> 
> 
> @@ -1294,23 +1326,35 @@ namespace ir {
> 
>      INLINE void LoadInstruction::out(std::ostream &out, const Function &fn)
> const {
>        this->outOpcode(out);
> -      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") <<
> "aligned";
> +      out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
>        out << " {";
>        for (uint32_t i = 0; i < valueNum; ++i)
>          out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
>        out << "}";
> -      out << " %" << this->getSrc(fn, 1);
> -      out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
> +      out << " %" << this->getSrc(fn, 0);
> +      AddressMode am = this->getAddressMode();
> +      out << " bti:";
> +      if ( am == AM_DynamicBti) {
> +        out << " %" << this->getBtiReg();
> +      } else {
> +        out << this->getSurfaceIndex();
> +      }
>      }
> 
>      INLINE void StoreInstruction::out(std::ostream &out, const Function &fn)
> const {
>        this->outOpcode(out);
> -      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") <<
> "aligned";
> -      out << " %" << this->getSrc(fn, 1) << " {";
> +      out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
> +      out << " %" << this->getSrc(fn, 0) << " {";
>        for (uint32_t i = 0; i < valueNum; ++i)
> -        out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
> +        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
>        out << "}";
> -      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
> +      AddressMode am = this->getAddressMode();
> +      out << " bti:";
> +      if ( am == AM_DynamicBti) {
> +        out << " %" << this->getBtiReg();
> +      } else {
> +        out << this->getSurfaceIndex();
> +      }
>      }
> 
>      INLINE void ReadARFInstruction::out(std::ostream &out, const Function
> &fn) const {
> @@ -1664,19 +1708,17 @@ DECL_MEM_FN(BitCastInstruction, Type,
> getSrcType(void), getSrcType())
>  DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
>  DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
>  DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
> -DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void),
> getAddressSpace())
> +DECL_MEM_FN(MemInstruction, AddressSpace, getAddressSpace(void),
> getAddressSpace())
> +DECL_MEM_FN(MemInstruction, AddressMode, getAddressMode(void),
> getAddressMode())
> +DECL_MEM_FN(MemInstruction, Register, getAddressRegister(void),
> getAddressRegister())
> +DECL_MEM_FN(MemInstruction, Register, getBtiReg(void), getBtiReg())
> +DECL_MEM_FN(MemInstruction, unsigned, getSurfaceIndex(void),
> getSurfaceIndex())
> +DECL_MEM_FN(MemInstruction, Type,     getValueType(void),
> getValueType())
> +DECL_MEM_FN(MemInstruction, bool,     isAligned(void), isAligned())
> +DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void),
> getAddressIndex())
>  DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void),
> getAtomicOpcode())
> -DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
> -DECL_MEM_FN(StoreInstruction, Type, getValueType(void),
> getValueType())
>  DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void),
> getValueNum())
> -DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void),
> getAddressSpace())
> -DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
> -DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
> -DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
>  DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void),
> getValueNum())
> -DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void),
> getAddressSpace())
> -DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
> -DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
>  DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
>  DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void),
> getLabelIndex())
>  DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
> @@ -1702,6 +1744,15 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t,
> getImageIndex(void), getImageIndex
> 
>  #undef DECL_MEM_FN
> 
> +#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
> +  RET CLASS::PROTOTYPE { \
> +    return reinterpret_cast<internal::CLASS*>(this)->CALL; \
> +  }
> +DECL_MEM_FN(MemInstruction, void,     setSurfaceIndex(unsigned id),
> setSurfaceIndex(id))
> +DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg),
> setBtiReg(reg))
> +
> +#undef DECL_MEM_FN
> +
>    Immediate LoadImmInstruction::getImmediate(void) const {
>      const Function &fn = this->getFunction();
>      return reinterpret_cast<const internal::LoadImmInstruction*>(this)-
> >getImmediate(fn);
> @@ -1843,8 +1894,16 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t,
> getImageIndex(void), getImageIndex
>    }
> 
>    // For all unary functions with given opcode
> -  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace
> space, Register bti, bool fixedBTI, Tuple src) {
> -    return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI,
> src).convert();
> +  Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst,
> AddressSpace space, Register address, Tuple payload, AddressMode AM,
> Register bti) {
> +    internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp,
> type, dst, space, address, payload, AM);
> +    insn.setBtiReg(bti);
> +    return insn.convert();
> +  }
> +
> +  Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst,
> AddressSpace space, Register address, Tuple payload, AddressMode AM,
> unsigned SurfaceIndex) {
> +    internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp,
> type, dst, space, address, payload, AM);
> +    insn.setSurfaceIndex(SurfaceIndex);
> +    return insn.convert();
>    }
> 
>    // BRA
> @@ -1892,10 +1951,25 @@ DECL_MEM_FN(GetImageInfoInstruction,
> uint8_t, getImageIndex(void), getImageIndex
>                     AddressSpace space, \
>                     uint32_t valueNum, \
>                     bool dwAligned, \
> -                   bool fixedBTI, \
> +                   AddressMode AM, \
> +                   unsigned SurfaceIndex) \
> +  { \
> +    internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
> +    insn.setSurfaceIndex(SurfaceIndex);\
> +    return insn.convert(); \
> +  } \
> +  Instruction NAME(Type type, \
> +                   Tuple tuple, \
> +                   Register offset, \
> +                   AddressSpace space, \
> +                   uint32_t valueNum, \
> +                   bool dwAligned, \
> +                   AddressMode AM, \
>                     Register bti) \
>    { \
> -    return
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).c
> onvert(); \
> +    internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
> +    insn.setBtiReg(bti); \
> +    return insn.convert(); \
>    }
> 
>    DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
> diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
> index 3f3c655..b8f95ba 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -65,6 +65,13 @@ namespace ir {
>      MEM_INVALID
>    };
> 
> +  enum AddressMode : uint8_t {
> +    AM_DynamicBti = 0,
> +    AM_Stateless,
> +    AM_StaticBti,
> +    AM_INVALID
> +  };
> +
>    enum AtomicOps {
>      ATOMIC_OP_AND       = 1,
>      ATOMIC_OP_OR        = 2,
> @@ -288,20 +295,30 @@ namespace ir {
>      static bool isClassOf(const Instruction &insn);
>    };
> 
> +  class MemInstruction : public Instruction {
> +  public:
> +    unsigned getSurfaceIndex() const;
> +    unsigned getAddressIndex() const;
> +    /*! Address mode used by this memory access */
> +    AddressMode getAddressMode() const;
> +    Register getBtiReg() const;
> +    /*! Return the register that contains the addresses */
> +    Register getAddressRegister() const;
> +    AddressSpace getAddressSpace() const;
> +    /*! Return the types of the values */
> +    Type getValueType() const;
> +    bool isAligned(void) const;
> +    void setBtiReg(Register reg);
> +    void setSurfaceIndex(unsigned idx);
> +  };
> +
>    /*! Atomic instruction */
> -  class AtomicInstruction : public Instruction {
> +  class AtomicInstruction : public MemInstruction {
>    public:
>      /*! Where the address register goes */
> -    static const uint32_t btiIndex = 0;
> -    static const uint32_t addressIndex = 1;
> -    /*! Address space that is manipulated here */
> -    AddressSpace getAddressSpace(void) const;
> -    Register getBTI(void) const { return this->getSrc(btiIndex); }
> -    bool isFixedBTI(void) const;
> +    static const uint32_t addressIndex = 0;
>      /*! Return the atomic function code */
>      AtomicOps getAtomicOpcode(void) const;
> -    /*! Return the register that contains the addresses */
> -    INLINE Register getAddress(void) const { return this-
> >getSrc(addressIndex); }
>      /*! Return true if the given instruction is an instance of this class */
>      static bool isClassOf(const Instruction &insn);
>    };
> @@ -309,27 +326,15 @@ namespace ir {
>    /*! Store instruction. First source is the address. Next sources are the
>     *  values to store contiguously at the given address
>     */
> -  class StoreInstruction : public Instruction {
> +  class StoreInstruction : public MemInstruction {
>    public:
>      /*! Where the address register goes */
> -    static const uint32_t btiIndex = 0;
> -    static const uint32_t addressIndex = 1;
> -    /*! Return the types of the values to store */
> -    Type getValueType(void) const;
> -    /*! Give the number of values the instruction is storing (srcNum-1) */
> +    static const uint32_t addressIndex = 0;
>      uint32_t getValueNum(void) const;
> -    Register getBTI(void) const { return this->getSrc(btiIndex); }
> -    bool isFixedBTI(void) const;
> -    /*! Address space that is manipulated here */
> -    AddressSpace getAddressSpace(void) const;
> -    /*! DWORD aligned means untyped read for Gen. That is what matters */
> -    bool isAligned(void) const;
> -    /*! Return the register that contains the addresses */
> -    INLINE Register getAddress(void) const { return this-
> >getSrc(addressIndex); }
>      /*! Return the register that contain value valueID */
>      INLINE Register getValue(uint32_t valueID) const {
>        GBE_ASSERT(valueID < this->getValueNum());
> -      return this->getSrc(valueID + 2u);
> +      return this->getSrc(valueID + 1u);
>      }
>      /*! Return true if the given instruction is an instance of this class */
>      static bool isClassOf(const Instruction &insn);
> @@ -339,20 +344,10 @@ namespace ir {
>     *  The multiple destinations are the contiguous values loaded at the given
>     *  address
>     */
> -  class LoadInstruction : public Instruction {
> +  class LoadInstruction : public MemInstruction {
>    public:
> -    /*! Type of the loaded values (ie type of all the destinations) */
> -    Type getValueType(void) const;
>      /*! Number of values loaded (ie number of destinations) */
>      uint32_t getValueNum(void) const;
> -    /*! Address space that is manipulated here */
> -    AddressSpace getAddressSpace(void) const;
> -    /*! DWORD aligned means untyped read for Gen. That is what matters */
> -    bool isAligned(void) const;
> -    /*! Return the register that contains the addresses */
> -    INLINE Register getAddress(void) const { return this->getSrc(1u); }
> -    Register getBTI(void) const {return this->getSrc(0u);}
> -    bool isFixedBTI(void) const;
>      /*! Return the register that contain value valueID */
>      INLINE Register getValue(uint32_t valueID) const {
>        return this->getDst(valueID);
> @@ -725,7 +720,8 @@ namespace ir {
>    /*! F32TO16.{dstType <- srcType} dst src */
>    Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
>    /*! atomic dst addr.space {src1 {src2}} */
> -  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space,
> Register bti, bool fixedBTI, Tuple src);
> +  Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace
> space, Register ptr, Tuple payload, AddressMode, unsigned);
> +  Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace
> space, Register ptr, Tuple src, AddressMode, Register);
>    /*! bra labelIndex */
>    Instruction BRA(LabelIndex labelIndex);
>    /*! (pred) bra labelIndex */
> @@ -740,10 +736,12 @@ namespace ir {
>    Instruction WHILE(LabelIndex labelIndex, Register pred);
>    /*! ret */
>    Instruction RET(void);
> -  /*! load.type.space {dst1,...,dst_valueNum} offset value */
> -  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
> -  /*! store.type.space offset {src1,...,src_valueNum} value */
> -  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
> +  /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
> +  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> +  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
> +  /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
> +  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> +  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
>    /*! loadi.type dst value */
>    Instruction LOADI(Type type, Register dst, ImmediateIndex value);
>    /*! sync.params... (see Sync instruction) */
> diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
> index 9fcdf74..66ced8c 100644
> --- a/backend/src/ir/lowering.cpp
> +++ b/backend/src/ir/lowering.cpp
> @@ -320,7 +320,7 @@ namespace ir {
>              continue;
> 
>            IndirectLoad indirectLoad;
> -          Register addr = load->getAddress();
> +          Register addr = load->getAddressRegister();
>            indirectLoad.argID = argID;
>            indirectLoad.load = insn;
> 
> @@ -368,7 +368,7 @@ namespace ir {
> 
>          const Register reg = load->getValue(valueID);
> 
> -        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddress(),
> offset);
> +        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load-
> >getAddressRegister(), offset);
>          mov.insert(ins_after, &ins_after);
>          replaced = true;
>        }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 7299d53..39665b8 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -386,6 +386,36 @@ namespace gbe
>      ir::Context &ctx;
>    };
> 
> +  class GenWriter;
> +  class MemoryInstHelper {
> +    public:
> +      MemoryInstHelper(ir::Context &c, ir::Unit &u, GenWriter *w, bool l)
> +                : ctx(c),
> +                  unit(u),
> +                  writer(w),
> +                  legacyMode(l)
> +                  { }
> +      void         emitUnalignedDQLoadStore(Value *llvmValues);
> +      ir::Tuple    getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType,
> unsigned start, unsigned elemNum);
> +      void         emitBatchLoadOrStore(const ir::Type type, const uint32_t
> elemNum, Value *llvmValues, Type * elemType);
> +      ir::Register getOffsetAddress(ir::Register basePtr, unsigned offset);
> +      void         shootMessage(ir::Type type, ir::Register offset, ir::Tuple value,
> unsigned elemNum);
> +      template <bool isLoad, typename T>
> +      void         emitLoadOrStore(T &I);
> +    private:
> +      ir::Context             &ctx;
> +      ir::Unit               &unit;
> +      GenWriter            *writer;
> +      bool              legacyMode;
> +      ir::AddressSpace   addrSpace;
> +      ir::Register            mBTI;
> +      ir::Register            mPtr;
> +      ir::AddressMode mAddressMode;
> +      unsigned        SurfaceIndex;
> +      bool                  isLoad;
> +      bool               dwAligned;
> +  };
> +
>    /*! Translate LLVM IR code to Gen IR code */
>    class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
>    {
> @@ -437,6 +467,9 @@ namespace gbe
>      Function *Func;
>      const Module *TheModule;
>      int btiBase;
> +    /*! legacyMode is for hardware before BDW,
> +     * which does not support stateless memory access */
> +    bool legacyMode;
>    public:
>      static char ID;
>      explicit GenWriter(ir::Unit &unit)
> @@ -446,7 +479,8 @@ namespace gbe
>          regTranslator(ctx),
>          LI(0),
>          TheModule(0),
> -        btiBase(BTI_RESERVED_NUM)
> +        btiBase(BTI_RESERVED_NUM),
> +        legacyMode(true)
>      {
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
>        initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
> @@ -491,7 +525,8 @@ namespace gbe
> 
>        Func = &F;
>        assignBti(F);
> -      analyzePointerOrigin(F);
> +      if (legacyMode)
> +        analyzePointerOrigin(F);
> 
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
>        LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> @@ -643,6 +678,7 @@ namespace gbe
>        ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
>                                              int index, ConstTypeId tid);
>        ir::ImmediateIndex processConstantVector(ConstantVector *cv, int
> index);
> +      friend class MemoryInstHelper;
>    };
> 
>    char GenWriter::ID = 0;
> @@ -3570,47 +3606,55 @@ namespace gbe
>      CallSite::arg_iterator AI = CS.arg_begin();
>      CallSite::arg_iterator AE = CS.arg_end();
>      GBE_ASSERT(AI != AE);
> -
> -    ir::AddressSpace addrSpace;
> -
>      Value *llvmPtr = *AI;
> -    Value *bti = getBtiRegister(llvmPtr);
> -    Value *ptrBase = getPointerBase(llvmPtr);
> +    ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr-
> >getType()->getPointerAddressSpace());
>      ir::Register pointer = this->getRegister(llvmPtr);
> -    ir::Register baseReg = this->getRegister(ptrBase);
> 
> +    ir::Register ptr;
>      ir::Register btiReg;
> -    bool fixedBTI = false;
> -    if (isa<ConstantInt>(bti)) {
> -      fixedBTI = true;
> -      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
> -      addrSpace = btiToGen(index);
> -      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
> -      btiReg = ctx.reg(ir::FAMILY_DWORD);
> -      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
> +    unsigned SurfaceIndex = 0xff;;
> +
> +    ir::AddressMode AM;
> +    if (legacyMode) {
> +      Value *bti = getBtiRegister(llvmPtr);
> +      Value *ptrBase = getPointerBase(llvmPtr);
> +      ir::Register baseReg = this->getRegister(ptrBase);
> +      if (isa<ConstantInt>(bti)) {
> +        AM = ir::AM_StaticBti;
> +        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
> +        addrSpace = btiToGen(SurfaceIndex);
> +      } else {
> +        AM = ir::AM_DynamicBti;
> +        addrSpace = ir::MEM_MIXED;
> +        btiReg = this->getRegister(bti);
> +      }
> +      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +      ptr = ctx.reg(pointerFamily);
> +      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
>      } else {
> -      addrSpace = ir::MEM_MIXED;
> -      btiReg = this->getRegister(bti);
> +      AM = ir::AM_Stateless;
> +      ptr = pointer;
>      }
> 
> -    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> -    const ir::Register ptr = ctx.reg(pointerFamily);
> -    ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> -
>      const ir::Register dst = this->getRegister(&I);
> 
> -    uint32_t srcNum = 0;
> -    vector<ir::Register> src;
> -    src.push_back(ptr);
> -    srcNum++;
> +    uint32_t payloadNum = 0;
> +    vector<ir::Register> payload;
>      AI++;
> 
>      while(AI != AE) {
> -      src.push_back(this->getRegister(*(AI++)));
> -      srcNum++;
> +      payload.push_back(this->getRegister(*(AI++)));
> +      payloadNum++;
> +    }
> +    ir::Type type = getType(ctx, llvmPtr->getType()-
> >getPointerElementType());
> +    const ir::Tuple payloadTuple = payloadNum == 0 ?
> +                                   ir::Tuple(0) :
> +                                   ctx.arrayTuple(&payload[0], payloadNum);
> +    if (AM == ir::AM_DynamicBti) {
> +      ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM,
> btiReg);
> +    } else {
> +      ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM,
> SurfaceIndex);
>      }
> -    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
> -    ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
>    }
> 
>    /* append a new sampler. should be called before any reference to
> @@ -4323,65 +4367,82 @@ namespace gbe
>      this->newRegister(&I);
>    }
>    void GenWriter::regAllocateStoreInst(StoreInst &I) {}
> +  void GenWriter::emitLoadInst(LoadInst &I) {
> +    MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this,
> legacyMode);
> +    h->emitLoadOrStore<true>(I);
> +    delete h;
> +  }
> 
> -  void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t
> elemNum,
> -                                      Value *llvmValues, const ir::Register ptr,
> -                                      const ir::AddressSpace addrSpace,
> -                                      Type * elemType, bool isLoad, ir::Register bti,
> -                                      bool dwAligned, bool fixedBTI) {
> -    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> -    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
> -    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> -    const uint32_t perMsgNum = elemNum / msgNum;
> +  void GenWriter::emitStoreInst(StoreInst &I) {
> +    MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this,
> legacyMode);
> +    h->emitLoadOrStore<false>(I);
> +    delete h;
> +  }
> 
> -    for (uint32_t msg = 0; msg < msgNum; ++msg) {
> -      // Build the tuple data in the vector
> +  llvm::FunctionPass *createGenPass(ir::Unit &unit) {
> +    return new GenWriter(unit);
> +  }
> +
> +  ir::Tuple MemoryInstHelper::getValueTuple(llvm::Value *llvmValues,
> llvm::Type *elemType, unsigned start, unsigned elemNum) {
>        vector<ir::Register> tupleData; // put registers here
> -      for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
> +      for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
>          ir::Register reg;
> -        if(regTranslator.isUndefConst(llvmValues, elemID)) {
> +        if(writer->regTranslator.isUndefConst(llvmValues, elemID)) {
>            Value *v = Constant::getNullValue(elemType);
> -          reg = this->getRegister(v);
> +          reg = writer->getRegister(v);
>          } else
> -          reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
> +          reg = writer->getRegister(llvmValues, start + elemID);
> 
>          tupleData.push_back(reg);
>        }
> -      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
> -
> -      // We may need to update to offset the pointer
> -      ir::Register addr;
> -      if (msg == 0)
> -        addr = ptr;
> -      else {
> -        const ir::Register offset = ctx.reg(pointerFamily);
> -        ir::ImmediateIndex immIndex;
> -        ir::Type immType;
> +      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
> +      return tuple;
> +  }
> +
> +  void MemoryInstHelper::emitBatchLoadOrStore(const ir::Type type, const
> uint32_t elemNum,
> +                                      Value *llvmValues,
> +                                      Type * elemType) {
> +    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
> +    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> +    const uint32_t perMsgNum = elemNum / msgNum;
> +
> +    for (uint32_t msg = 0; msg < msgNum; ++msg) {
> +      // Build the tuple data in the vector
> +     ir::Tuple tuple = getValueTuple(llvmValues, elemType, perMsgNum*msg,
> perMsgNum);
>          // each message can read/write 16 byte
>          const int32_t stride = 16;
> -        if (pointerFamily == ir::FAMILY_DWORD) {
> -          immIndex = ctx.newImmediate(int32_t(msg*stride));
> -          immType = ir::TYPE_S32;
> -        } else {
> -          immIndex = ctx.newImmediate(int64_t(msg*stride));
> -          immType = ir::TYPE_S64;
> -        }
> +      ir::Register addr = getOffsetAddress(mPtr, msg*stride);
> +      shootMessage(type, addr, tuple, perMsgNum);
> +    }
> +  }
> +
> +  ir::Register MemoryInstHelper::getOffsetAddress(ir::Register basePtr,
> unsigned offset) {
> +    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +    ir::Register addr;
> +    if (offset == 0)
> +      addr = basePtr;
> +    else {
> +      const ir::Register offsetReg = ctx.reg(pointerFamily);
> +      ir::ImmediateIndex immIndex;
> +      ir::Type immType;
> 
> -        addr = ctx.reg(pointerFamily);
> -        ctx.LOADI(immType, offset, immIndex);
> -        ctx.ADD(immType, addr, ptr, offset);
> +      if (pointerFamily == ir::FAMILY_DWORD) {
> +        immIndex = ctx.newImmediate(int32_t(offset));
> +        immType = ir::TYPE_S32;
> +      } else {
> +        immIndex = ctx.newImmediate(int64_t(offset));
> +        immType = ir::TYPE_S64;
>        }
> 
> -      // Emit the instruction
> -      if (isLoad)
> -        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned,
> fixedBTI, bti);
> -      else
> -        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned,
> fixedBTI, bti);
> +      addr = ctx.reg(pointerFamily);
> +      ctx.LOADI(immType, offsetReg, immIndex);
> +      ctx.ADD(immType, addr, basePtr, offsetReg);
>      }
> +    return addr;
>    }
> 
>    // handle load of dword/qword with unaligned address
> -  void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value
> *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool
> dwAligned, bool fixedBTI)
> +  void MemoryInstHelper::emitUnalignedDQLoadStore(Value *llvmValues)
>    {
>      Type *llvmType = llvmValues->getType();
>      unsigned byteSize = getTypeByteSize(unit, llvmType);
> @@ -4395,19 +4456,7 @@ namespace gbe
>      }
>      const ir::Type type = getType(ctx, elemType);
> 
> -    vector<ir::Register> tupleData;
> -    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
> -      ir::Register reg;
> -      if(regTranslator.isUndefConst(llvmValues, elemID)) {
> -        Value *v = Constant::getNullValue(elemType);
> -        reg = this->getRegister(v);
> -      } else
> -        reg = this->getRegister(llvmValues, elemID);
> -
> -      tupleData.push_back(reg);
> -    }
> -    const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
> -
> +    ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
>      vector<ir::Register> byteTupleData;
>      for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
>        byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
> @@ -4415,97 +4464,83 @@ namespace gbe
>      const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
> 
>      if (isLoad) {
> -      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned,
> fixedBTI, bti);
> +      shootMessage(ir::TYPE_U8, mPtr, byteTuple, byteSize);
>        ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
>      } else {
>        ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
>        // FIXME: byte scatter does not handle correctly vector store, after fix
> that,
>        //        we can directly use on store instruction like:
>        //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize,
> dwAligned, fixedBTI, bti);
> -      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>        for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
> -        const ir::Register reg = byteTupleData[elemID];
> -        ir::Register addr;
> -        if (elemID == 0)
> -          addr = ptr;
> -        else {
> -          const ir::Register offset = ctx.reg(pointerFamily);
> -          ir::ImmediateIndex immIndex;
> -          immIndex = ctx.newImmediate(int32_t(elemID));
> -          addr = ctx.reg(pointerFamily);
> -          ctx.LOADI(ir::TYPE_S32, offset, immIndex);
> -          ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
> -        }
> -       ctx.STORE(ir::TYPE_U8, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
> +        const ir::Register addr = getOffsetAddress(mPtr, elemID);
> +        const ir::Tuple value = ctx.arrayTuple(&byteTupleData[elemID], 1);
> +        shootMessage(ir::TYPE_U8, addr, value, 1);
>        }
>      }
>    }
> 
> -  extern int OCL_SIMD_WIDTH;
>    template <bool isLoad, typename T>
> -  INLINE void GenWriter::emitLoadOrStore(T &I)
> -  {
> +  void MemoryInstHelper::emitLoadOrStore(T &I) {
>      Value *llvmPtr = I.getPointerOperand();
>      Value *llvmValues = getLoadOrStoreValue(I);
>      Type *llvmType = llvmValues->getType();
> -    const bool dwAligned = (I.getAlignment() % 4) == 0;
> -    ir::AddressSpace addrSpace;
> -    const ir::Register pointer = this->getRegister(llvmPtr);
> +    dwAligned = (I.getAlignment() % 4) == 0;
> +    addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()-
> >getPointerAddressSpace());
> +    const ir::Register pointer = writer->getRegister(llvmPtr);
>      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> 
> -    Value *bti = getBtiRegister(llvmPtr);
> -    Value *ptrBase = getPointerBase(llvmPtr);
> -    ir::Register baseReg = this->getRegister(ptrBase);
> -    bool zeroBase = false;
> -    if (isa<ConstantPointerNull>(ptrBase)) {
> -      zeroBase = true;
> -    }
> -
> -    ir::Register btiReg;
> -    bool fixedBTI = false;
> -    if (isa<ConstantInt>(bti)) {
> -      fixedBTI = true;
> -      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
> -      addrSpace = btiToGen(index);
> -      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
> -      btiReg = ctx.reg(ir::FAMILY_DWORD);
> -      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
> -    } else {
> -      addrSpace = ir::MEM_MIXED;
> -      btiReg = this->getRegister(bti);
> -    }
> -
> +    this->isLoad = isLoad;
>      Type *scalarType = llvmType;
>      if (!isScalarType(llvmType)) {
>        VectorType *vectorType = cast<VectorType>(llvmType);
>        scalarType = vectorType->getElementType();
>      }
> 
> -    ir::Register ptr = ctx.reg(pointerFamily);
> -    // FIXME: avoid subtraction zero at this stage is not a good idea,
> -    // but later ArgumentLower pass need to match exact load/addImm
> pattern
> -    // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
> -    if (!zeroBase)
> -      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> -    else
> -      ptr = pointer;
> +    // calculate bti and pointer operand
> +    if (legacyMode) {
> +      Value *bti = writer->getBtiRegister(llvmPtr);
> +      Value *ptrBase = writer->getPointerBase(llvmPtr);
> +      ir::Register baseReg = writer->getRegister(ptrBase);
> +      bool zeroBase = isa<ConstantPointerNull>(ptrBase) ? true : false;
> +
> +      if (isa<ConstantInt>(bti)) {
> +        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
> +        addrSpace = btiToGen(SurfaceIndex);
> +        mAddressMode = ir::AM_StaticBti;
> +      } else {
> +        addrSpace = ir::MEM_MIXED;
> +        mBTI = writer->getRegister(bti);
> +        mAddressMode = ir::AM_DynamicBti;
> +      }
> +      mPtr = ctx.reg(pointerFamily);
> +
> +      // FIXME: avoid subtraction zero at this stage is not a good idea,
> +      // but later ArgumentLower pass need to match exact load/addImm
> pattern
> +      // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
> +      if (!zeroBase)
> +        ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg);
> +      else
> +        mPtr = pointer;
> +    } else {
> +      mPtr = pointer;
> +      SurfaceIndex = 0xff;
> +      mAddressMode = ir::AM_Stateless;
> +    }
> 
>      unsigned primitiveBits = scalarType->getPrimitiveSizeInBits();
>      if (!dwAligned
>         && (primitiveBits == 64
>            || primitiveBits == 32)
>         ) {
> -      emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad,
> dwAligned, fixedBTI);
> +      emitUnalignedDQLoadStore(llvmValues);
>        return;
>      }
>      // Scalar is easy. We neednot build register tuples
>      if (isScalarType(llvmType) == true) {
>        const ir::Type type = getType(ctx, llvmType);
> -      const ir::Register values = this->getRegister(llvmValues);
> -      if (isLoad)
> -        ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
> -      else
> -        ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
> +      const ir::Register values = writer->getRegister(llvmValues);
> +      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> +      shootMessage(type, mPtr, tuple, 1);
>      }
>      // A vector type requires to build a tuple
>      else {
> @@ -4521,7 +4556,7 @@ namespace gbe
>        // And the llvm does cast a type3 data to type4 for load/store instruction,
>        // so a 4 elements vector may only have 3 valid elements. We need to fix
> it to correct element
>        // count here.
> -      if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
> +      if (elemNum == 4 && writer->regTranslator.isUndefConst(llvmValues, 3))
>            elemNum = 3;
> 
>        // The code is going to be fairly different from types to types (based on
> @@ -4532,72 +4567,44 @@ namespace gbe
>        if(dataFamily == ir::FAMILY_DWORD && addrSpace !=
> ir::MEM_CONSTANT) {
>          // One message is enough here. Nothing special to do
>          if (elemNum <= 4) {
> -          // Build the tuple data in the vector
> -          vector<ir::Register> tupleData; // put registers here
> -          for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
> -            ir::Register reg;
> -            if(regTranslator.isUndefConst(llvmValues, elemID)) {
> -              Value *v = Constant::getNullValue(elemType);
> -              reg = this->getRegister(v);
> -            } else
> -              reg = this->getRegister(llvmValues, elemID);
> -
> -            tupleData.push_back(reg);
> -          }
> -          const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
> -
> -          // Emit the instruction
> -          if (isLoad)
> -            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
> -          else
> -            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
> +          ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
> +          shootMessage(type, mPtr, tuple, elemNum);
>          }
> -        // Not supported by the hardware. So, we split the message and we use
> -        // strided loads and stores
>          else {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
>          }
>        }
>        else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
>                (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
>        } else {
>          for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
> -          if(regTranslator.isUndefConst(llvmValues, elemID))
> +          if(writer->regTranslator.isUndefConst(llvmValues, elemID))
>              continue;
> 
> -          const ir::Register reg = this->getRegister(llvmValues, elemID);
> -          ir::Register addr;
> -          if (elemID == 0)
> -            addr = ptr;
> -          else {
> -              const ir::Register offset = ctx.reg(pointerFamily);
> -              ir::ImmediateIndex immIndex;
> -              int elemSize = getTypeByteSize(unit, elemType);
> -              immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
> -              addr = ctx.reg(pointerFamily);
> -              ctx.LOADI(ir::TYPE_S32, offset, immIndex);
> -              ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
> -          }
> -          if (isLoad)
> -           ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
> -          else
> -           ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
> +          const ir::Register reg = writer->getRegister(llvmValues, elemID);
> +          int elemSize = getTypeByteSize(unit, elemType);
> +
> +          ir::Register addr = getOffsetAddress(mPtr, elemID*elemSize);
> +          const ir::Tuple tuple = ctx.arrayTuple(&reg, 1);
> +          shootMessage(type, addr, tuple, 1);
>          }
>        }
>      }
>    }
> 
> -  void GenWriter::emitLoadInst(LoadInst &I) {
> -    this->emitLoadOrStore<true>(I);
> -  }
> -
> -  void GenWriter::emitStoreInst(StoreInst &I) {
> -    this->emitLoadOrStore<false>(I);
> -  }
> -
> -  llvm::FunctionPass *createGenPass(ir::Unit &unit) {
> -    return new GenWriter(unit);
> +  void MemoryInstHelper::shootMessage(ir::Type type, ir::Register offset, ir::Tuple value, unsigned elemNum) {
> +    if (mAddressMode == ir::AM_DynamicBti) {
> +      if (isLoad)
> +        ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
> +      else
> +        ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
> +    } else {
> +      if (isLoad)
> +        ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
> +      else
> +        ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
> +    }
>    }
>  } /* namespace gbe */
> 
> --
> 2.3.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list