[Beignet] [PATCH V2] GBE: Refine ir for memory operation like atomic/load/store

Ruiling Song ruiling.song at intel.com
Mon Oct 26 00:14:27 PDT 2015


legacyMode selects which kind of address mode to use.
When legacyMode is true, we need to do the complex bti analysis.

dynamicBti and staticBti are mostly for platforms before BDW,
and stateless is for BDW+ platforms.
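
For reviewers, a minimal sketch of how a backend consumer is expected to
read the new fields (mirroring the load/store emit paths in
gen_insn_selection.cpp below; the helper name resolveBti is only
illustrative):

  // Assumes the ir headers touched by this patch (ir::MemInstruction,
  // ir::BTI, ir::AddressMode) and <cassert>. Pick the surface either from
  // the compile-time surface index (AM_StaticBti) or from the register
  // produced by the bti analysis (AM_DynamicBti); AM_Stateless (BDW+) is
  // not handled on the legacy path yet.
  static ir::BTI resolveBti(const ir::MemInstruction &insn) {
    ir::BTI b;
    switch (insn.getAddressMode()) {
    case ir::AM_StaticBti:
      b.isConst = 1;
      b.imm = insn.getSurfaceIndex();   // surface known at compile time
      break;
    case ir::AM_DynamicBti:
      b.isConst = 0;
      b.reg = insn.getBtiReg();         // surface index lives in a register
      break;
    default:
      assert(0 && "stateless not supported yet");
    }
    return b;
  }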

v2:
 only do analyzePointerOrigin() under legacyMode.
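
In code, the v2 change and the address mode selection condense to roughly
the following (simplified from the GenWriter changes in
llvm_gen_backend.cpp below; register setup and the base-pointer
subtraction are omitted):

  if (legacyMode)
    analyzePointerOrigin(F);          // v2: only needed for the bti modes

  ir::AddressMode AM;
  if (legacyMode) {                   // pre-BDW: binding table based access
    Value *bti = getBtiRegister(llvmPtr);
    if (isa<ConstantInt>(bti))
      AM = ir::AM_StaticBti;          // surface index is a constant
    else
      AM = ir::AM_DynamicBti;         // surface index comes from a register
  } else {
    AM = ir::AM_Stateless;            // BDW+: use the pointer directly
  }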

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 132 ++++------
 backend/src/ir/context.hpp                 |  19 --
 backend/src/ir/instruction.cpp             | 410 +++++++++++++++++------------
 backend/src/ir/instruction.hpp             |  78 +++---
 backend/src/ir/lowering.cpp                |   4 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 396 ++++++++++++++--------------
 6 files changed, 532 insertions(+), 507 deletions(-)
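
With that in place, the instruction builders take the address mode plus
either a bti register or a compile-time surface index; for example the
atomic emission in llvm_gen_backend.cpp ends up as (condensed from the
hunk below):

  if (AM == ir::AM_DynamicBti)
    ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, btiReg);
  else
    ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);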

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index a07c5dd..259c3cf 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1249,11 +1249,11 @@ namespace gbe
   }
 
   void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
-                                 uint32_t srcNum, Reg src0,
+                                 uint32_t msgPayload, Reg src0,
                                  Reg src1, Reg src2, GenRegister bti,
                                  vector<GenRegister> temps) {
     unsigned dstNum = 1 + temps.size();
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, msgPayload + 1);
 
     if (bti.file != GEN_IMMEDIATE_VALUE) {
       insn->state.flag = 0;
@@ -1267,14 +1267,15 @@ namespace gbe
     }
 
     insn->src(0) = src0;
-    if(srcNum > 1) insn->src(1) = src1;
-    if(srcNum > 2) insn->src(2) = src2;
-    insn->src(srcNum) = bti;
+    if(msgPayload > 1) insn->src(1) = src1;
+    if(msgPayload > 2) insn->src(2) = src2;
+    insn->src(msgPayload) = bti;
+
     insn->extra.function = function;
-    insn->extra.elem = srcNum;
+    insn->extra.elem = msgPayload;
 
     SelectionVector *vector = this->appendVector();
-    vector->regNum = srcNum;
+    vector->regNum = msgPayload; //bti not included in SelectionVector
     vector->offsetID = 0;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
@@ -3370,8 +3371,6 @@ namespace gbe
                    uint32_t valueNum,
                    ir::BTI bti) const
     {
-        //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
-
         GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
         sel.UNTYPED_READ(addr, dst.data(), valueNum, b, sel.getBTITemps(bti));
     }
@@ -3672,28 +3671,12 @@ namespace gbe
       return false;
     }
 
-    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
-      using namespace ir;
-      SelectionDAG *child0 = dag.child[0];
-      ir::BTI b;
-      if (insn.isFixedBTI()) {
-        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
-        const auto imm = immInsn.getImmediate();
-        b.isConst = 1;
-        b.imm = imm.getIntegerValue();
-      } else {
-        b.isConst = 0;
-        b.reg = insn.getBTI();
-      }
-      return b;
-    }
-
     /*! Implements base class */
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
     {
       using namespace ir;
       const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
-      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
@@ -3701,8 +3684,17 @@ namespace gbe
                  insn.getAddressSpace() == MEM_MIXED);
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
 
-      BTI bti = getBTI(dag, insn);
-
+      BTI bti;
+      AddressMode am = insn.getAddressMode();
+      if (am == AM_StaticBti) {
+        bti.isConst = 1;
+        bti.imm = insn.getSurfaceIndex();
+      } else if (am == AM_DynamicBti) {
+        bti.isConst = 0;
+        bti.reg = insn.getBtiReg();
+      } else {
+        assert(0 && "stateless not supported yet");
+      }
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
       bool allConstant = isAllConstant(bti);
@@ -3730,12 +3722,7 @@ namespace gbe
           this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
       }
 
-
-      // for fixed bti, don't generate the useless loadi
-      if (insn.isFixedBTI())
-        dag.child[0] = NULL;
       markAllChildren(dag);
-
       return true;
     }
   };
@@ -3839,32 +3826,26 @@ namespace gbe
       }
     }
 
-
-    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
-      using namespace ir;
-      SelectionDAG *child0 = dag.child[0];
-      ir::BTI b;
-      if (insn.isFixedBTI()) {
-        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
-        const auto imm = immInsn.getImmediate();
-        b.isConst = 1;
-        b.imm = imm.getIntegerValue();
-      } else {
-        b.isConst = 0;
-        b.reg = insn.getBTI();
-      }
-      return b;
-    }
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
     {
       using namespace ir;
       const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
-      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
 
-      const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
-      BTI bti = getBTI(dag, insn);
+      const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) && sel.isScalarReg(insn.getValue(0));
+      BTI bti;
+      AddressMode am = insn.getAddressMode();
+      if (am == AM_StaticBti) {
+        bti.isConst = 1;
+        bti.imm = insn.getSurfaceIndex();
+      } else if (am == AM_DynamicBti) {
+        bti.isConst = 0;
+        bti.reg = insn.getBtiReg();
+      } else {
+        assert(0 && "stateless not supported yet");
+      }
 
       if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
         this->emitWrite64(sel, insn, address, bti);
@@ -3874,11 +3855,7 @@ namespace gbe
         this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
       }
 
-      // for fixed bti, don't generate the useless loadi
-      if (insn.isFixedBTI())
-        dag.child[0] = NULL;
       markAllChildren(dag);
-
       return true;
     }
   };
@@ -4741,47 +4718,36 @@ namespace gbe
           this->opcodes.push_back(ir::Opcode(op));
     }
 
-    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
-      using namespace ir;
-      SelectionDAG *child0 = dag.child[0];
-      ir::BTI b;
-      if (insn.isFixedBTI()) {
-        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
-        const auto imm = immInsn.getImmediate();
-        b.isConst = 1;
-        b.imm = imm.getIntegerValue();
-      } else {
-        b.isConst = 0;
-        b.reg = insn.getBTI();
-      }
-      return b;
-    }
-
     INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
       using namespace ir;
       const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
 
-      ir::BTI b = getBTI(dag, insn);
+      ir::BTI b;
       const AtomicOps atomicOp = insn.getAtomicOpcode();
       unsigned srcNum = insn.getSrcNum();
-      unsigned opNum = srcNum - 1;
+      unsigned msgPayload;
+
+      AddressMode AM = insn.getAddressMode();
+      if (AM == AM_DynamicBti) {
+        b.reg = insn.getBtiReg();
+        msgPayload = srcNum - 1;
+      } else {
+        b.imm = insn.getSurfaceIndex();
+        b.isConst = 1;
+        msgPayload = srcNum;
+      }
 
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
       GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
-      GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32);   //address
+      GenRegister src0 = sel.selReg(insn.getAddressRegister(), TYPE_U32);
       GenRegister src1 = src0, src2 = src0;
-      if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
-      if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
+      if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+      if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
 
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+      sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(b));
 
-      sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, sel.getBTITemps(b));
-
-      // for fixed bti, don't generate the useless loadi
-      if (insn.isFixedBTI())
-        dag.child[0] = NULL;
       markAllChildren(dag);
-
       return true;
     }
   };
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 0f7ded4..ab0d8b5 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -189,25 +189,6 @@ namespace ir {
       this->append(insn);
     }
 
-    /*! LOAD with the destinations directly specified */
-    template <typename... Args>
-    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
-    {
-      const Tuple index = this->tuple(values...);
-      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
-      GBE_ASSERT(valueNum > 0);
-      this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
-    }
-
-    /*! STORE with the sources directly specified */
-    template <typename... Args>
-    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
-    {
-      const Tuple index = this->tuple(values...);
-      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
-      GBE_ASSERT(valueNum > 0);
-      this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
-    }
     void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
 
   protected:
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f93c528..0246920 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -316,60 +316,104 @@ namespace ir {
       Type srcType; //!< Type to convert from
     };
 
+    class ALIGNED_INSTRUCTION MemInstruction :
+      public BasePolicy
+    {
+    public:
+      MemInstruction(AddressMode   _AM,
+                     AddressSpace _AS,
+                     bool _dwAligned,
+                     Type _type,
+                     Register _offset)
+                   : AM(_AM),
+                     AS(_AS),
+                     dwAligned(_dwAligned),
+                     type(_type),
+                     SurfaceIndex(0),
+                     offset(_offset) {
+      }
+      AddressMode  getAddressMode()    const { return AM; }
+      AddressSpace getAddressSpace()   const { return AS; }
+      /*! a MemInstruction may carry one optional bti register */
+      Register     getBtiReg()         const { assert(AM == AM_DynamicBti); return BtiReg; }
+      unsigned     getSurfaceIndex()   const { assert(AM != AM_DynamicBti); return SurfaceIndex; }
+      Register     getAddressRegister()const { return offset; }
+      unsigned     getAddressIndex()   const { return 0; }
+      Type         getValueType()      const { return type; }
+      INLINE bool  isAligned(void)     const { return !!dwAligned; }
+
+      void         setSurfaceIndex (unsigned id)  { SurfaceIndex = id; }
+      void         setBtiReg(Register reg)        { BtiReg = reg;      }
+    protected:
+      /*! including address reg + optional bti reg */
+      int          getBaseSrcNum()    const { return AM == AM_DynamicBti ? 2 : 1; }
+      bool         hasExtraBtiReg()   const { return AM == AM_DynamicBti; }
+      AddressMode       AM;
+      AddressSpace      AS;
+      uint8_t           dwAligned : 1;
+      Type              type;
+      union {
+        Register        BtiReg;
+        unsigned        SurfaceIndex;
+      };
+      Register          offset;
+    };
+
     class ALIGNED_INSTRUCTION AtomicInstruction :
-      public BasePolicy,
+      public MemInstruction,
       public NDstPolicy<AtomicInstruction, 1>
     {
     public:
       AtomicInstruction(AtomicOps atomicOp,
+                         Type type,
                          Register dst,
                          AddressSpace addrSpace,
-                         Register bti,
-                         bool fixedBTI,
-                         Tuple src)
+                         Register address,
+                         Tuple payload,
+                         AddressMode AM)
+        : MemInstruction(AM, addrSpace, true, type, address)
       {
         this->opcode = OP_ATOMIC;
         this->atomicOp = atomicOp;
         this->dst[0] = dst;
-        this->src = src;
-        this->addrSpace = addrSpace;
-        this->bti = bti;
-        this->fixedBTI = fixedBTI ? 1: 0;
-        srcNum = 2;
+        this->payload = payload;
+
+        int payloadNum = 1;
         if((atomicOp == ATOMIC_OP_INC) ||
           (atomicOp == ATOMIC_OP_DEC))
-          srcNum = 1;
+          payloadNum = 0;
         if(atomicOp == ATOMIC_OP_CMPXCHG)
-          srcNum = 3;
-        srcNum++;
+          payloadNum = 2;
+
+        srcNum = payloadNum + getBaseSrcNum();
       }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
-        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
-        if (ID == 0u)
-          return bti;
-        else
-          return fn.getRegister(src, ID -1);
+        GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0) {
+          return offset;
+        } else if (hasExtraBtiReg() && (int)ID == (int)srcNum-1) {
+          return getBtiReg();
+        } else {
+          return fn.getRegister(payload, ID - 1);
+        }
       }
       INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
-        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
-        if (ID == 0u)
-          bti = reg;
-        else
-          fn.setRegister(src, ID - 1, reg);
+        GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0) {
+          offset = reg;
+        } else if (hasExtraBtiReg() && (int)ID == (int)srcNum - 1) {
+          setBtiReg(reg);
+        } else {
+          fn.setRegister(payload, ID - 1, reg);
+        }
       }
       INLINE uint32_t getSrcNum(void) const { return srcNum; }
 
-      INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
-      INLINE Register getBTI(void) const { return bti; }
-      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register dst[1];
-      Tuple src;
-      AddressSpace addrSpace; //!< Address space
-      Register bti;               //!< bti
-      uint8_t fixedBTI:1;      //!< fixed bti or not
+      Tuple payload;
       uint8_t srcNum:3;     //!<Source Number
       AtomicOps atomicOp:6;     //!<Source Number
     };
@@ -428,119 +472,102 @@ namespace ir {
       Register dst[0];       //!< No destination
     };
 
+
     class ALIGNED_INSTRUCTION LoadInstruction :
-      public BasePolicy,
-      public NSrcPolicy<LoadInstruction, 2>
+      public MemInstruction
     {
-    public:
-      LoadInstruction(Type type,
-                      Tuple dstValues,
-                      Register offset,
-                      AddressSpace addrSpace,
-                      uint32_t valueNum,
-                      bool dwAligned,
-                      bool fixedBTI,
-                      Register bti)
-      {
-        GBE_ASSERT(valueNum < 128);
-        this->opcode = OP_LOAD;
-        this->type = type;
-        this->offset = offset;
-        this->values = dstValues;
-        this->addrSpace = addrSpace;
-        this->valueNum = valueNum;
-        this->dwAligned = dwAligned ? 1 : 0;
-        this->fixedBTI = fixedBTI ? 1 : 0;
-        this->bti = bti;
-      }
-      INLINE Register getDst(const Function &fn, uint32_t ID) const {
-        GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
-        return fn.getRegister(values, ID);
-      }
-      INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
-        GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
-        fn.setRegister(values, ID, reg);
-      }
-      INLINE uint32_t getDstNum(void) const { return valueNum; }
-      INLINE Type getValueType(void) const { return type; }
-      INLINE uint32_t getValueNum(void) const { return valueNum; }
-      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE Register getBTI(void) const { return bti; }
-      INLINE bool wellFormed(const Function &fn, std::string &why) const;
-      INLINE void out(std::ostream &out, const Function &fn) const;
-      INLINE bool isAligned(void) const { return !!dwAligned; }
-      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
-      Type type;              //!< Type to store
-      Register src[0];        //!< Address where to load from
-      Register bti;
-      Register offset;        //!< Alias to make it similar to store
-      Tuple values;           //!< Values to load
-      AddressSpace addrSpace; //!< Where to load
-      uint8_t fixedBTI:1;
-      uint8_t valueNum:7;     //!< Number of values to load
-      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
-    };
+      public:
+        LoadInstruction(Type type,
+                        Tuple dstValues,
+                        Register offset,
+                        AddressSpace AS,
+                        uint32_t _valueNum,
+                        bool dwAligned,
+                        AddressMode AM)
+                      : MemInstruction(AM, AS, dwAligned, type, offset),
+                        valueNum(_valueNum),
+                        values(dstValues)
+        {
+          this->opcode = OP_LOAD;
+        }
 
+        INLINE unsigned getSrcNum() const { return getBaseSrcNum(); }
+        INLINE Register getSrc(const Function &fn, unsigned id) const {
+          if (id == 0) return offset;
+          if (hasExtraBtiReg() && id == 1) return BtiReg;
+          assert(0 && "LoadInstruction::getSrc() out-of-range");
+          return ir::Register(0);
+        }
+        INLINE void     setSrc(Function &fn, unsigned id, Register reg) {
+          assert(id < getSrcNum());
+          if (id == 0) { offset = reg;   return; }
+          if (id == 1) { setBtiReg(reg); return; }
+        }
+        INLINE unsigned getDstNum() const { return valueNum; }
+        INLINE Register getDst(const Function &fn, unsigned id) const {
+          assert(id < valueNum);
+          return fn.getRegister(values, id);
+        }
+        INLINE void     setDst(Function &fn, unsigned id, Register reg) {
+          assert(id < getDstNum());
+          fn.setRegister(values, id, reg);
+        }
+        INLINE uint32_t getValueNum(void) const { return valueNum; }
+        INLINE Register getValue(const Function &fn, unsigned id) const {
+          assert(id < valueNum);
+          return fn.getRegister(values, id);
+        }
+        INLINE bool wellFormed(const Function &fn, std::string &why) const;
+        INLINE void out(std::ostream &out, const Function &fn) const;
+
+        uint8_t         valueNum;
+        Tuple             values;
+    };
     class ALIGNED_INSTRUCTION StoreInstruction :
-      public BasePolicy, public NDstPolicy<StoreInstruction, 0>
+      public MemInstruction,
+      public NDstPolicy<StoreInstruction, 0>
     {
-    public:
-      StoreInstruction(Type type,
-                       Tuple values,
-                       Register offset,
-                       AddressSpace addrSpace,
-                       uint32_t valueNum,
-                       bool dwAligned,
-                       bool fixedBTI,
-                       Register bti)
-      {
-        GBE_ASSERT(valueNum < 255);
-        this->opcode = OP_STORE;
-        this->type = type;
-        this->offset = offset;
-        this->values = values;
-        this->addrSpace = addrSpace;
-        this->valueNum = valueNum;
-        this->dwAligned = dwAligned ? 1 : 0;
-        this->fixedBTI = fixedBTI ? 1 : 0;
-        this->bti = bti;
-      }
-      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
-        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
-        if (ID == 0u)
-          return bti;
-        else if (ID == 1u)
-          return offset;
-        else
-          return fn.getRegister(values, ID - 2);
-      }
-      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
-        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
-        if (ID == 0u)
-          bti = reg;
-        else if (ID == 1u)
-          offset = reg;
-        else
-          fn.setRegister(values, ID - 2, reg);
-      }
-      INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
-      INLINE uint32_t getValueNum(void) const { return valueNum; }
-      INLINE Type getValueType(void) const { return type; }
-      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE Register getBTI(void) const { return bti; }
-      INLINE bool wellFormed(const Function &fn, std::string &why) const;
-      INLINE void out(std::ostream &out, const Function &fn) const;
-      INLINE bool isAligned(void) const { return !!dwAligned; }
-      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
-      Type type;              //!< Type to store
-      Register bti;
-      Register offset;        //!< First source is the offset where to store
-      Tuple values;           //!< Values to store
-      AddressSpace addrSpace; //!< Where to store
-      uint8_t fixedBTI:1;                //!< Which btis need access
-      uint8_t valueNum:7;     //!< Number of values to store
-      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
-      Register dst[0];        //!< No destination
+      public:
+        StoreInstruction(Type type,
+                         Tuple values,
+                         Register offset,
+                         AddressSpace addrSpace,
+                         uint32_t valueNum,
+                         bool dwAligned,
+                         AddressMode AM)
+          : MemInstruction(AM, addrSpace, dwAligned, type, offset)
+        {
+          this->opcode = OP_STORE;
+          this->values = values;
+          this->valueNum = valueNum;
+        }
+        INLINE unsigned getValueNum()      const { return valueNum; }
+        INLINE Register getValue(const Function &fn, unsigned id) const {
+          return fn.getRegister(values, id);
+        }
+        INLINE unsigned getSrcNum()        const { return getBaseSrcNum() + valueNum; }
+        INLINE Register getSrc(const Function &fn, unsigned id) const {
+          if (id == 0)  return offset;
+          if (id <= valueNum) return fn.getRegister(values, id-1);
+          if (hasExtraBtiReg() && (int)id == (int)valueNum+1) return getBtiReg();
+          assert(0 && "StoreInstruction::getSrc() out-of-range");
+          return Register(0);
+        }
+        INLINE void     setSrc(Function &fn, unsigned id, Register reg) {
+          if (id == 0)                   { offset = reg; return; }
+          if (id > 0 && id <= valueNum)  { fn.setRegister(values, id-1, reg); return; }
+          if (hasExtraBtiReg() &&
+              (int)id == (int)valueNum + 1)        {
+            setBtiReg(reg);
+            return;
+          }
+          assert(0 && "StoreInstruction::setSrc() index out-of-range");
+        }
+        INLINE bool wellFormed(const Function &fn, std::string &why) const;
+        INLINE void out(std::ostream &out, const Function &fn) const;
+        Register      dst[0];
+        uint8_t     valueNum;
+        Tuple         values;
     };
 
     class ALIGNED_INSTRUCTION SampleInstruction : // TODO
@@ -1037,8 +1064,6 @@ namespace ir {
         if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
           return false;
 
-      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
-        return false;
       return true;
     }
 
@@ -1065,7 +1090,7 @@ namespace ir {
     template <typename T>
     INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
     {
-      if (UNLIKELY(insn.offset >= fn.regNum())) {
+      if (UNLIKELY(insn.getAddressRegister() >= fn.regNum())) {
         whyNot = "Out-of-bound offset register index";
         return false;
       }
@@ -1073,10 +1098,11 @@ namespace ir {
         whyNot = "Out-of-bound tuple index";
         return false;
       }
+
       // Check all registers
-      const RegisterFamily family = getFamily(insn.type);
-      for (uint32_t valueID = 0; valueID < insn.valueNum; ++valueID) {
-        const Register regID = fn.getRegister(insn.values, valueID);
+      const RegisterFamily family = getFamily(insn.getValueType());
+      for (uint32_t valueID = 0; valueID < insn.getValueNum(); ++valueID) {
+        const Register regID = insn.getValue(fn, valueID);
         if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
           return false;
       }
@@ -1260,12 +1286,18 @@ namespace ir {
 
     INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
-      out << "." << addrSpace;
+      out << "." << AS;
       out << " %" << this->getDst(fn, 0);
-      out << " {" << "%" << this->getSrc(fn, 1) << "}";
-      for (uint32_t i = 2; i < srcNum; ++i)
+      out << " {" << "%" << this->getSrc(fn, 0) << "}";
+      for (uint32_t i = 1; i < srcNum; ++i)
         out << " %" << this->getSrc(fn, i);
-      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+      AddressMode am = this->getAddressMode();
+      out << " bti:";
+      if ( am == AM_DynamicBti) {
+        out << " %" << this->getBtiReg();
+      } else {
+        out << this->getSurfaceIndex();
+      }
     }
 
 
@@ -1294,23 +1326,35 @@ namespace ir {
 
     INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
-      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+      out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
       out << " {";
       for (uint32_t i = 0; i < valueNum; ++i)
         out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out << " %" << this->getSrc(fn, 1);
-      out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+      out << " %" << this->getSrc(fn, 0);
+      AddressMode am = this->getAddressMode();
+      out << " bti:";
+      if ( am == AM_DynamicBti) {
+        out << " %" << this->getBtiReg();
+      } else {
+        out << this->getSurfaceIndex();
+      }
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
-      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
-      out << " %" << this->getSrc(fn, 1) << " {";
+      out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
+      out << " %" << this->getSrc(fn, 0) << " {";
       for (uint32_t i = 0; i < valueNum; ++i)
-        out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
+        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+      AddressMode am = this->getAddressMode();
+      out << " bti:";
+      if ( am == AM_DynamicBti) {
+        out << " %" << this->getBtiReg();
+      } else {
+        out << this->getSurfaceIndex();
+      }
     }
 
     INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1664,19 +1708,17 @@ DECL_MEM_FN(BitCastInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
-DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(MemInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(MemInstruction, AddressMode, getAddressMode(void), getAddressMode())
+DECL_MEM_FN(MemInstruction, Register, getAddressRegister(void), getAddressRegister())
+DECL_MEM_FN(MemInstruction, Register, getBtiReg(void), getBtiReg())
+DECL_MEM_FN(MemInstruction, unsigned, getSurfaceIndex(void), getSurfaceIndex())
+DECL_MEM_FN(MemInstruction, Type,     getValueType(void), getValueType())
+DECL_MEM_FN(MemInstruction, bool,     isAligned(void), isAligned())
+DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), getAddressIndex())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
-DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
-DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
-DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
-DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
-DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
-DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
-DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -1702,6 +1744,15 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
 
 #undef DECL_MEM_FN
 
+#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
+  RET CLASS::PROTOTYPE { \
+    return reinterpret_cast<internal::CLASS*>(this)->CALL; \
+  }
+DECL_MEM_FN(MemInstruction, void,     setSurfaceIndex(unsigned id), setSurfaceIndex(id))
+DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
+
+#undef DECL_MEM_FN
+
   Immediate LoadImmInstruction::getImmediate(void) const {
     const Function &fn = this->getFunction();
     return reinterpret_cast<const internal::LoadImmInstruction*>(this)->getImmediate(fn);
@@ -1843,8 +1894,16 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   }
 
   // For all unary functions with given opcode
-  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
-    return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
+  Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst, AddressSpace space, Register address, Tuple payload, AddressMode AM, Register bti) {
+    internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp, type, dst, space, address, payload, AM);
+    insn.setBtiReg(bti);
+    return insn.convert();
+  }
+
+  Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst, AddressSpace space, Register address, Tuple payload, AddressMode AM, unsigned SurfaceIndex) {
+    internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp, type, dst, space, address, payload, AM);
+    insn.setSurfaceIndex(SurfaceIndex);
+    return insn.convert();
   }
 
   // BRA
@@ -1892,10 +1951,25 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
                    AddressSpace space, \
                    uint32_t valueNum, \
                    bool dwAligned, \
-                   bool fixedBTI, \
+                   AddressMode AM, \
+                   unsigned SurfaceIndex) \
+  { \
+    internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
+    insn.setSurfaceIndex(SurfaceIndex);\
+    return insn.convert(); \
+  } \
+  Instruction NAME(Type type, \
+                   Tuple tuple, \
+                   Register offset, \
+                   AddressSpace space, \
+                   uint32_t valueNum, \
+                   bool dwAligned, \
+                   AddressMode AM, \
                    Register bti) \
   { \
-    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
+    internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
+    insn.setBtiReg(bti); \
+    return insn.convert(); \
   }
 
   DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 3f3c655..b8f95ba 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -65,6 +65,13 @@ namespace ir {
     MEM_INVALID
   };
 
+  enum AddressMode : uint8_t {
+    AM_DynamicBti = 0,
+    AM_Stateless,
+    AM_StaticBti,
+    AM_INVALID
+  };
+
   enum AtomicOps {
     ATOMIC_OP_AND       = 1,
     ATOMIC_OP_OR        = 2,
@@ -288,20 +295,30 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  class MemInstruction : public Instruction {
+  public:
+    unsigned getSurfaceIndex() const;
+    unsigned getAddressIndex() const;
+    /*! Address space that is manipulated here */
+    AddressMode getAddressMode() const;
+    Register getBtiReg() const;
+    /*! Return the register that contains the addresses */
+    Register getAddressRegister() const;
+    AddressSpace getAddressSpace() const;
+    /*! Return the types of the values */
+    Type getValueType() const;
+    bool isAligned(void) const;
+    void setBtiReg(Register reg);
+    void setSurfaceIndex(unsigned idx);
+  };
+
   /*! Atomic instruction */
-  class AtomicInstruction : public Instruction {
+  class AtomicInstruction : public MemInstruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t btiIndex = 0;
-    static const uint32_t addressIndex = 1;
-    /*! Address space that is manipulated here */
-    AddressSpace getAddressSpace(void) const;
-    Register getBTI(void) const { return this->getSrc(btiIndex); }
-    bool isFixedBTI(void) const;
+    static const uint32_t addressIndex = 0;
     /*! Return the atomic function code */
     AtomicOps getAtomicOpcode(void) const;
-    /*! Return the register that contains the addresses */
-    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
   };
@@ -309,27 +326,15 @@ namespace ir {
   /*! Store instruction. First source is the address. Next sources are the
    *  values to store contiguously at the given address
    */
-  class StoreInstruction : public Instruction {
+  class StoreInstruction : public MemInstruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t btiIndex = 0;
-    static const uint32_t addressIndex = 1;
-    /*! Return the types of the values to store */
-    Type getValueType(void) const;
-    /*! Give the number of values the instruction is storing (srcNum-1) */
+    static const uint32_t addressIndex = 0;
     uint32_t getValueNum(void) const;
-    Register getBTI(void) const { return this->getSrc(btiIndex); }
-    bool isFixedBTI(void) const;
-    /*! Address space that is manipulated here */
-    AddressSpace getAddressSpace(void) const;
-    /*! DWORD aligned means untyped read for Gen. That is what matters */
-    bool isAligned(void) const;
-    /*! Return the register that contains the addresses */
-    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       GBE_ASSERT(valueID < this->getValueNum());
-      return this->getSrc(valueID + 2u);
+      return this->getSrc(valueID + 1u);
     }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
@@ -339,20 +344,10 @@ namespace ir {
    *  The multiple destinations are the contiguous values loaded at the given
    *  address
    */
-  class LoadInstruction : public Instruction {
+  class LoadInstruction : public MemInstruction {
   public:
-    /*! Type of the loaded values (ie type of all the destinations) */
-    Type getValueType(void) const;
     /*! Number of values loaded (ie number of destinations) */
     uint32_t getValueNum(void) const;
-    /*! Address space that is manipulated here */
-    AddressSpace getAddressSpace(void) const;
-    /*! DWORD aligned means untyped read for Gen. That is what matters */
-    bool isAligned(void) const;
-    /*! Return the register that contains the addresses */
-    INLINE Register getAddress(void) const { return this->getSrc(1u); }
-    Register getBTI(void) const {return this->getSrc(0u);}
-    bool isFixedBTI(void) const;
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       return this->getDst(valueID);
@@ -725,7 +720,8 @@ namespace ir {
   /*! F32TO16.{dstType <- srcType} dst src */
   Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
-  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
+  Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace space, Register ptr, Tuple payload, AddressMode, unsigned);
+  Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace space, Register ptr, Tuple src, AddressMode, Register);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
@@ -740,10 +736,12 @@ namespace ir {
   Instruction WHILE(LabelIndex labelIndex, Register pred);
   /*! ret */
   Instruction RET(void);
-  /*! load.type.space {dst1,...,dst_valueNum} offset value */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
-  /*! store.type.space offset {src1,...,src_valueNum} value */
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
+  /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
+  /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 9fcdf74..66ced8c 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -320,7 +320,7 @@ namespace ir {
             continue;
 
           IndirectLoad indirectLoad;
-          Register addr = load->getAddress();
+          Register addr = load->getAddressRegister();
           indirectLoad.argID = argID;
           indirectLoad.load = insn;
 
@@ -368,7 +368,7 @@ namespace ir {
 
         const Register reg = load->getValue(valueID);
 
-        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddress(), offset);
+        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddressRegister(), offset);
         mov.insert(ins_after, &ins_after);
         replaced = true;
       }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 980996e..d4868a2 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -386,6 +386,36 @@ namespace gbe
     ir::Context &ctx;
   };
 
+  class GenWriter;
+  class MemoryInstHelper {
+    public:
+      MemoryInstHelper(ir::Context &c, ir::Unit &u, GenWriter *w, bool l)
+                : writer(w),
+                  ctx(c),
+                  unit(u),
+                  legacyMode(l)
+                  { }
+      void         emitUnalignedDQLoadStore(Value *llvmValues);
+      ir::Tuple    getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType, unsigned start, unsigned elemNum);
+      void         emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, Value *llvmValues, Type * elemType);
+      ir::Register getOffsetAddress(ir::Register basePtr, unsigned offset);
+      void         shootMessage(ir::Type type, ir::Register offset, ir::Tuple value, unsigned elemNum);
+      template <bool isLoad, typename T>
+      void         emitLoadOrStore(T &I);
+    private:
+      GenWriter            *writer;
+      bool              legacyMode;
+      ir::AddressSpace   addrSpace;
+      ir::Register            mBTI;
+      ir::Register            mPtr;
+      ir::AddressMode mAddressMode;
+      unsigned        SurfaceIndex;
+      bool                  isLoad;
+      bool               dwAligned;
+      ir::Context             &ctx;
+      ir::Unit               &unit;
+  };
+
   /*! Translate LLVM IR code to Gen IR code */
   class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
   {
@@ -437,6 +467,9 @@ namespace gbe
     Function *Func;
     const Module *TheModule;
     int btiBase;
+    /*! legacyMode is for hardware before BDW,
+     * which does not support stateless memory access */
+    bool legacyMode;
   public:
     static char ID;
     explicit GenWriter(ir::Unit &unit)
@@ -446,7 +479,8 @@ namespace gbe
         regTranslator(ctx),
         LI(0),
         TheModule(0),
-        btiBase(BTI_RESERVED_NUM)
+        btiBase(BTI_RESERVED_NUM),
+        legacyMode(true)
     {
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
       initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
@@ -491,7 +525,8 @@ namespace gbe
 
       Func = &F;
       assignBti(F);
-      analyzePointerOrigin(F);
+      if (legacyMode)
+        analyzePointerOrigin(F);
 
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
       LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -643,6 +678,7 @@ namespace gbe
       ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
                                             int index, ConstTypeId tid);
       ir::ImmediateIndex processConstantVector(ConstantVector *cv, int index);
+      friend class MemoryInstHelper;
   };
 
   char GenWriter::ID = 0;
@@ -3572,47 +3608,55 @@ namespace gbe
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
-
-    ir::AddressSpace addrSpace;
-
     Value *llvmPtr = *AI;
-    Value *bti = getBtiRegister(llvmPtr);
-    Value *ptrBase = getPointerBase(llvmPtr);
+    ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
     ir::Register pointer = this->getRegister(llvmPtr);
-    ir::Register baseReg = this->getRegister(ptrBase);
 
+    ir::Register ptr;
     ir::Register btiReg;
-    bool fixedBTI = false;
-    if (isa<ConstantInt>(bti)) {
-      fixedBTI = true;
-      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
-      addrSpace = btiToGen(index);
-      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
-      btiReg = ctx.reg(ir::FAMILY_DWORD);
-      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+    unsigned SurfaceIndex = 0xff;
+
+    ir::AddressMode AM;
+    if (legacyMode) {
+      Value *bti = getBtiRegister(llvmPtr);
+      Value *ptrBase = getPointerBase(llvmPtr);
+      ir::Register baseReg = this->getRegister(ptrBase);
+      if (isa<ConstantInt>(bti)) {
+        AM = ir::AM_StaticBti;
+        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+        addrSpace = btiToGen(SurfaceIndex);
+      } else {
+        AM = ir::AM_DynamicBti;
+        addrSpace = ir::MEM_MIXED;
+        btiReg = this->getRegister(bti);
+      }
+      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      ptr = ctx.reg(pointerFamily);
+      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
     } else {
-      addrSpace = ir::MEM_MIXED;
-      btiReg = this->getRegister(bti);
+      AM = ir::AM_Stateless;
+      ptr = pointer;
     }
 
-    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
-    const ir::Register ptr = ctx.reg(pointerFamily);
-    ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
-
     const ir::Register dst = this->getRegister(&I);
 
-    uint32_t srcNum = 0;
-    vector<ir::Register> src;
-    src.push_back(ptr);
-    srcNum++;
+    uint32_t payloadNum = 0;
+    vector<ir::Register> payload;
     AI++;
 
     while(AI != AE) {
-      src.push_back(this->getRegister(*(AI++)));
-      srcNum++;
+      payload.push_back(this->getRegister(*(AI++)));
+      payloadNum++;
+    }
+    ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+    const ir::Tuple payloadTuple = payloadNum == 0 ?
+                                   ir::Tuple(0) :
+                                   ctx.arrayTuple(&payload[0], payloadNum);
+    if (AM == ir::AM_DynamicBti) {
+      ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, btiReg);
+    } else {
+      ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
     }
-    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-    ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
   }
 
   /* append a new sampler. should be called before any reference to
@@ -4325,65 +4369,82 @@ namespace gbe
     this->newRegister(&I);
   }
   void GenWriter::regAllocateStoreInst(StoreInst &I) {}
+  void GenWriter::emitLoadInst(LoadInst &I) {
+    MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this, legacyMode);
+    h->emitLoadOrStore<true>(I);
+    delete h;
+  }
 
-  void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
-                                      Value *llvmValues, const ir::Register ptr,
-                                      const ir::AddressSpace addrSpace,
-                                      Type * elemType, bool isLoad, ir::Register bti,
-                                      bool dwAligned, bool fixedBTI) {
-    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
-    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
-    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
-    const uint32_t perMsgNum = elemNum / msgNum;
+  void GenWriter::emitStoreInst(StoreInst &I) {
+    MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this, legacyMode);
+    h->emitLoadOrStore<false>(I);
+    delete h;
+  }
 
-    for (uint32_t msg = 0; msg < msgNum; ++msg) {
-      // Build the tuple data in the vector
+  llvm::FunctionPass *createGenPass(ir::Unit &unit) {
+    return new GenWriter(unit);
+  }
+
+  ir::Tuple MemoryInstHelper::getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType, unsigned start, unsigned elemNum) {
       vector<ir::Register> tupleData; // put registers here
-      for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+      for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
         ir::Register reg;
-        if(regTranslator.isUndefConst(llvmValues, elemID)) {
+        if(writer->regTranslator.isUndefConst(llvmValues, elemID)) {
           Value *v = Constant::getNullValue(elemType);
-          reg = this->getRegister(v);
+          reg = writer->getRegister(v);
         } else
-          reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+          reg = writer->getRegister(llvmValues, start + elemID);
 
         tupleData.push_back(reg);
       }
-      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
-
-      // We may need to update to offset the pointer
-      ir::Register addr;
-      if (msg == 0)
-        addr = ptr;
-      else {
-        const ir::Register offset = ctx.reg(pointerFamily);
-        ir::ImmediateIndex immIndex;
-        ir::Type immType;
+      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
+      return tuple;
+  }
+
+  void MemoryInstHelper::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                                      Value *llvmValues,
+                                      Type * elemType) {
+    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+    const uint32_t perMsgNum = elemNum / msgNum;
+
+    for (uint32_t msg = 0; msg < msgNum; ++msg) {
+      // Build the tuple data in the vector
+     ir::Tuple tuple = getValueTuple(llvmValues, elemType, perMsgNum*msg, perMsgNum);
         // each message can read/write 16 byte
         const int32_t stride = 16;
-        if (pointerFamily == ir::FAMILY_DWORD) {
-          immIndex = ctx.newImmediate(int32_t(msg*stride));
-          immType = ir::TYPE_S32;
-        } else {
-          immIndex = ctx.newImmediate(int64_t(msg*stride));
-          immType = ir::TYPE_S64;
-        }
+      ir::Register addr = getOffsetAddress(mPtr, msg*stride);
+      shootMessage(type, addr, tuple, perMsgNum);
+    }
+  }
+
+  ir::Register MemoryInstHelper::getOffsetAddress(ir::Register basePtr, unsigned offset) {
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    ir::Register addr;
+    if (offset == 0)
+      addr = basePtr;
+    else {
+      const ir::Register offsetReg = ctx.reg(pointerFamily);
+      ir::ImmediateIndex immIndex;
+      ir::Type immType;
 
-        addr = ctx.reg(pointerFamily);
-        ctx.LOADI(immType, offset, immIndex);
-        ctx.ADD(immType, addr, ptr, offset);
+      if (pointerFamily == ir::FAMILY_DWORD) {
+        immIndex = ctx.newImmediate(int32_t(offset));
+        immType = ir::TYPE_S32;
+      } else {
+        immIndex = ctx.newImmediate(int64_t(offset));
+        immType = ir::TYPE_S64;
       }
 
-      // Emit the instruction
-      if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
-      else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
+      addr = ctx.reg(pointerFamily);
+      ctx.LOADI(immType, offsetReg, immIndex);
+      ctx.ADD(immType, addr, basePtr, offsetReg);
     }
+    return addr;
   }
 
   // handle load of dword/qword with unaligned address
-  void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
+  void MemoryInstHelper::emitUnalignedDQLoadStore(Value *llvmValues)
   {
     Type *llvmType = llvmValues->getType();
     const ir::Type type = getType(ctx, llvmType);
@@ -4397,19 +4458,7 @@ namespace gbe
       elemNum = vectorType->getNumElements();
     }
 
-    vector<ir::Register> tupleData;
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      ir::Register reg;
-      if(regTranslator.isUndefConst(llvmValues, elemID)) {
-        Value *v = Constant::getNullValue(elemType);
-        reg = this->getRegister(v);
-      } else
-        reg = this->getRegister(llvmValues, elemID);
-
-      tupleData.push_back(reg);
-    }
-    const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
-
+    ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
     vector<ir::Register> byteTupleData;
     for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
       byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
@@ -4417,96 +4466,81 @@ namespace gbe
     const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
 
     if (isLoad) {
-      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
+      shootMessage(ir::TYPE_U8, mPtr, byteTuple, byteSize);
       ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
     } else {
       ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
       // FIXME: byte scatter does not handle correctly vector store, after fix that,
       //        we can directly use on store instruction like:
       //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
-      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
-        const ir::Register reg = byteTupleData[elemID];
-        ir::Register addr;
-        if (elemID == 0)
-          addr = ptr;
-        else {
-          const ir::Register offset = ctx.reg(pointerFamily);
-          ir::ImmediateIndex immIndex;
-          immIndex = ctx.newImmediate(int32_t(elemID));
-          addr = ctx.reg(pointerFamily);
-          ctx.LOADI(ir::TYPE_S32, offset, immIndex);
-          ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
-        }
-       ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
+        const ir::Register addr = getOffsetAddress(mPtr, elemID);
+        const ir::Tuple value = ctx.arrayTuple(&byteTupleData[elemID], 1);
+        shootMessage(ir::TYPE_U8, addr, value, 1);
       }
     }
   }
 
-  extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
-  INLINE void GenWriter::emitLoadOrStore(T &I)
-  {
+  void MemoryInstHelper::emitLoadOrStore(T &I) {
     Value *llvmPtr = I.getPointerOperand();
     Value *llvmValues = getLoadOrStoreValue(I);
     Type *llvmType = llvmValues->getType();
-    const bool dwAligned = (I.getAlignment() % 4) == 0;
-    ir::AddressSpace addrSpace;
-    const ir::Register pointer = this->getRegister(llvmPtr);
+    dwAligned = (I.getAlignment() % 4) == 0;
+    addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+    const ir::Register pointer = writer->getRegister(llvmPtr);
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
 
-    Value *bti = getBtiRegister(llvmPtr);
-    Value *ptrBase = getPointerBase(llvmPtr);
-    ir::Register baseReg = this->getRegister(ptrBase);
-    bool zeroBase = false;
-    if (isa<ConstantPointerNull>(ptrBase)) {
-      zeroBase = true;
-    }
-
-    ir::Register btiReg;
-    bool fixedBTI = false;
-    if (isa<ConstantInt>(bti)) {
-      fixedBTI = true;
-      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
-      addrSpace = btiToGen(index);
-      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
-      btiReg = ctx.reg(ir::FAMILY_DWORD);
-      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
-    } else {
-      addrSpace = ir::MEM_MIXED;
-      btiReg = this->getRegister(bti);
-    }
-
+    this->isLoad = isLoad;
     Type *scalarType = llvmType;
     if (!isScalarType(llvmType)) {
       VectorType *vectorType = cast<VectorType>(llvmType);
       scalarType = vectorType->getElementType();
     }
 
-    ir::Register ptr = ctx.reg(pointerFamily);
-    // FIXME: avoid subtraction zero at this stage is not a good idea,
-    // but later ArgumentLower pass need to match exact load/addImm pattern
-    // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
-    if (!zeroBase)
-      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
-    else
-      ptr = pointer;
+    // calculate bti and pointer operand
+    if (legacyMode) {
+      Value *bti = writer->getBtiRegister(llvmPtr);
+      Value *ptrBase = writer->getPointerBase(llvmPtr);
+      ir::Register baseReg = writer->getRegister(ptrBase);
+      bool zeroBase = isa<ConstantPointerNull>(ptrBase);
+
+      if (isa<ConstantInt>(bti)) {
+        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+        addrSpace = btiToGen(SurfaceIndex);
+        mAddressMode = ir::AM_StaticBti;
+      } else {
+        addrSpace = ir::MEM_MIXED;
+        mBTI = writer->getRegister(bti);
+        mAddressMode = ir::AM_DynamicBti;
+      }
+      mPtr = ctx.reg(pointerFamily);
+
+      // FIXME: skipping the zero subtraction at this stage is not ideal,
+      // but the later ArgumentLower pass needs to match an exact load/addImm
+      // pattern, so I avoid subtracting a zero base to satisfy that pass.
+      if (!zeroBase)
+        ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg);
+      else
+        mPtr = pointer;
+    } else {
+      mPtr = pointer;
+      SurfaceIndex = 0xff;
+      mAddressMode = ir::AM_Stateless;
+    }
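The branch above is the core of the new addressing scheme: a constant bti value becomes AM_StaticBti with the immediate kept in SurfaceIndex, a non-constant bti becomes AM_DynamicBti with the register kept in mBTI, and the non-legacy path is stateless with SurfaceIndex fixed to 0xff. A small self-contained illustration of that decision, using stand-in types rather than the real ir:: enums, looks like this:

    // Stand-alone sketch only; the real AM_* values live in the ir name space.
    #include <cstdint>

    enum AddressMode { AM_DynamicBti, AM_StaticBti, AM_Stateless };

    struct ModeChoice {
      AddressMode mode;
      uint32_t surfaceIndex;   // used by AM_StaticBti and AM_Stateless
    };

    // btiIsConstant/btiImm stand in for the isa<ConstantInt>(bti) test above.
    static ModeChoice chooseAddressMode(bool legacyMode, bool btiIsConstant,
                                        uint32_t btiImm) {
      if (!legacyMode)
        return {AM_Stateless, 0xff};   // stateless access, surface index fixed to 0xff
      if (btiIsConstant)
        return {AM_StaticBti, btiImm}; // surface index known at compile time
      return {AM_DynamicBti, 0};       // surface index carried in a register (mBTI)
    }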
 
     if (!dwAligned
        && (scalarType == IntegerType::get(I.getContext(), 64)
-          || scalarType == IntegerType::get(I.getContext(), 32))
-       ) {
-      emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
+          || scalarType == IntegerType::get(I.getContext(), 32))) {
+      emitUnalignedDQLoadStore(llvmValues);
       return;
     }
     // Scalar is easy. We need not build register tuples.
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
-      const ir::Register values = this->getRegister(llvmValues);
-      if (isLoad)
-        ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
-      else
-        ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
+      const ir::Register values = writer->getRegister(llvmValues);
+      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+      shootMessage(type, mPtr, tuple, 1);
     }
     // A vector type requires to build a tuple
     else {
@@ -4522,7 +4556,7 @@ namespace gbe
      // And llvm casts type3 data to type4 for load/store instructions,
      // so a 4-element vector may only have 3 valid elements. We need to fix up
      // the element count here.
-      if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
+      if (elemNum == 4 && writer->regTranslator.isUndefConst(llvmValues, 3))
           elemNum = 3;
 
       // The code is going to be fairly different from types to types (based on
@@ -4533,72 +4567,44 @@ namespace gbe
       if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
-          // Build the tuple data in the vector
-          vector<ir::Register> tupleData; // put registers here
-          for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-            ir::Register reg;
-            if(regTranslator.isUndefConst(llvmValues, elemID)) {
-              Value *v = Constant::getNullValue(elemType);
-              reg = this->getRegister(v);
-            } else
-              reg = this->getRegister(llvmValues, elemID);
-
-            tupleData.push_back(reg);
-          }
-          const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
-
-          // Emit the instruction
-          if (isLoad)
-            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
-          else
-            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
+          ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
+          shootMessage(type, mPtr, tuple, elemNum);
         }
-        // Not supported by the hardware. So, we split the message and we use
-        // strided loads and stores
         else {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
         }
       }
       else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
               (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
-          if(regTranslator.isUndefConst(llvmValues, elemID))
+          if(writer->regTranslator.isUndefConst(llvmValues, elemID))
             continue;
 
-          const ir::Register reg = this->getRegister(llvmValues, elemID);
-          ir::Register addr;
-          if (elemID == 0)
-            addr = ptr;
-          else {
-              const ir::Register offset = ctx.reg(pointerFamily);
-              ir::ImmediateIndex immIndex;
-              int elemSize = getTypeByteSize(unit, elemType);
-              immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
-              addr = ctx.reg(pointerFamily);
-              ctx.LOADI(ir::TYPE_S32, offset, immIndex);
-              ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
-          }
-          if (isLoad)
-           ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
-          else
-           ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
+          const ir::Register reg = writer->getRegister(llvmValues, elemID);
+          int elemSize = getTypeByteSize(unit, elemType);
+
+          ir::Register addr = getOffsetAddress(mPtr, elemID*elemSize);
+          const ir::Tuple tuple = ctx.arrayTuple(&reg, 1);
+          shootMessage(type, addr, tuple, 1);
         }
       }
     }
   }
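getValueTuple() above replaces the inline tuple-building loop that this hunk removes. The reconstruction below shows what it presumably does; the signature and the meaning of the third argument (a start index, guessed from the call site passing 0) are assumptions:

    // Hedged reconstruction of getValueTuple(), based on the removed inline loop.
    ir::Tuple MemoryInstHelper::getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType,
                                              uint32_t start, uint32_t elemNum) {
      vector<ir::Register> tupleData;
      for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
        ir::Register reg;
        if (writer->regTranslator.isUndefConst(llvmValues, start + elemID)) {
          // Undefined elements are read back as zero.
          Value *v = Constant::getNullValue(elemType);
          reg = writer->getRegister(v);
        } else
          reg = writer->getRegister(llvmValues, start + elemID);
        tupleData.push_back(reg);
      }
      return ctx.arrayTuple(&tupleData[0], elemNum);
    }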
 
-  void GenWriter::emitLoadInst(LoadInst &I) {
-    this->emitLoadOrStore<true>(I);
-  }
-
-  void GenWriter::emitStoreInst(StoreInst &I) {
-    this->emitLoadOrStore<false>(I);
-  }
-
-  llvm::FunctionPass *createGenPass(ir::Unit &unit) {
-    return new GenWriter(unit);
+  void MemoryInstHelper::shootMessage(ir::Type type, ir::Register offset, ir::Tuple value, unsigned elemNum) {
+    if (mAddressMode == ir::AM_DynamicBti) {
+      if (isLoad)
+        ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
+      else
+        ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
+    } else {
+      if (isLoad)
+        ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
+      else
+        ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
+    }
   }
 } /* namespace gbe */
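For readers following the new context API: the two call shapes in shootMessage() imply a pair of LOAD/STORE overloads on the IR context, one taking the bti register for AM_DynamicBti and one taking an immediate surface index for AM_StaticBti and AM_Stateless. The sketch below only illustrates that shape with stub types; the real declarations are in the ir headers touched by this patch and may differ:

    #include <cstdint>

    namespace sketch {
      enum Type { TYPE_U8, TYPE_U32 };
      enum AddressSpace { MEM_GLOBAL, MEM_CONSTANT, MEM_MIXED };
      enum AddressMode { AM_DynamicBti, AM_StaticBti, AM_Stateless };
      struct Register { uint32_t id; };
      struct Tuple    { uint32_t id; };

      struct Context {
        // AM_DynamicBti: the surface index lives in a register (mBTI above).
        void LOAD(Type, Tuple, Register /*offset*/, AddressSpace, unsigned /*elemNum*/,
                  bool /*dwAligned*/, AddressMode, Register /*bti*/) {}
        // AM_StaticBti / AM_Stateless: the surface index is an immediate
        // (SurfaceIndex above; 0xff on the stateless path).
        void LOAD(Type, Tuple, Register /*offset*/, AddressSpace, unsigned /*elemNum*/,
                  bool /*dwAligned*/, AddressMode, unsigned /*surfaceIndex*/) {}
      };
    }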
 
-- 
2.3.1


