[Beignet] [PATCH 3/3] GBE: Optimize byte/short load/store using untyped read/write

Ruiling Song ruiling.song at intel.com
Thu Mar 6 00:14:21 PST 2014


Scatter/gather messages perform much worse than untyped read/write. So if we
can pack char/short loads/stores to use untyped messages, just do it.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_context.cpp                |   29 +++++
 backend/src/backend/gen_context.hpp                |    2 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    2 +
 backend/src/backend/gen_insn_selection.cpp         |  120 +++++++++++++++-----
 backend/src/backend/gen_insn_selection.hxx         |    2 +
 backend/src/backend/gen_register.hpp               |   22 ++++
 backend/src/ir/instruction.hpp                     |    3 +-
 backend/src/llvm/llvm_gen_backend.cpp              |  112 ++++++++++--------
 8 files changed, 214 insertions(+), 78 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 7104e81..9689ac5 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1776,6 +1776,35 @@ namespace gbe
     p->BYTE_SCATTER(src, bti, elemSize);
   }
 
+  void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    for(uint32_t i = 0; i < insn.dstNum; i++) {
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+    }
+  }
+
+  void GenContext::emitPackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    p->push();
+    if(simdWidth == 8) {
+      for(uint32_t i = 0; i < insn.srcNum; i++)
+        p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+    } else {
+      // when destination expands two registers, the source must span two registers.
+      p->curr.execWidth = 8;
+      for(uint32_t i = 0; i < insn.srcNum; i++) {
+        GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+        GenRegister src = ra->genReg(insn.src(i));
+
+        p->curr.quarterControl = 0;
+        p->MOV(dsti, src);
+        p->curr.quarterControl = 1;
+        p->MOV(GenRegister::Qn(dsti,1), GenRegister::Qn(src, 1));
+      }
+    }
+    p->pop();
+  }
+
   void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6cfc295..a853731 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -142,6 +142,8 @@ namespace gbe
     void emitAtomicInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
     void emitByteScatterInstruction(const SelectionInstruction &insn);
+    void emitPackByteInstruction(const SelectionInstruction &insn);
+    void emitUnpackByteInstruction(const SelectionInstruction &insn);
     void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 13cbd41..9eb04de 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -30,6 +30,8 @@ DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
 DECL_GEN7_SCHEDULE(DWordGather,     80,        1,        1)
+DECL_GEN7_SCHEDULE(PackByte,        20,        1,        1)
+DECL_GEN7_SCHEDULE(UnpackByte,      20,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 46f0123..48b3069 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -529,6 +529,10 @@ namespace gbe
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
+    /*! Unpack the uint to char4 */
+    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
+    /*! pack the char4 to uint */
+    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -1114,6 +1118,18 @@ namespace gbe
     insn->dst(0) = dst;
     insn->extra.function = bti;
   }
+  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
+    insn->src(0) = src;
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->dst(i) = dst[i];
+  }
+  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->src(i) = src[i];
+    insn->dst(0) = dst;
+  }
 
   void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
@@ -2415,26 +2431,50 @@ namespace gbe
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
                         GenRegister address,
-                        GenRegister value,
                         uint32_t bti) const
     {
       using namespace ir;
-      GBE_ASSERT(insn.getValueNum() == 1);
+      const uint32_t valueNum = insn.getValueNum();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      if(valueNum > 1) {
+        vector<GenRegister> dst(valueNum);
+        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
+
+        if(elemSize == GEN_BYTE_SCATTER_WORD) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
+        } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
+        }
 
-      // We need a temporary register if we read bytes or words
-      Register dst = Register(value.value.reg);
-      if (elemSize == GEN_BYTE_SCATTER_WORD ||
-          elemSize == GEN_BYTE_SCATTER_BYTE) {
-        dst = sel.reg(FAMILY_DWORD);
-        sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti);
-      }
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        }
 
-      // Repack bytes or words using a converting mov instruction
-      if (elemSize == GEN_BYTE_SCATTER_WORD)
-        sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
-      else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-        sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+        sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+        }
+      } else {
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister value = sel.selReg(insn.getValue(0));
+        // We need a temporary register if we read bytes or words
+        Register dst = Register(value.value.reg);
+        if (elemSize == GEN_BYTE_SCATTER_WORD ||
+            elemSize == GEN_BYTE_SCATTER_BYTE) {
+          dst = sel.reg(FAMILY_DWORD);
+          sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti);
+        }
+
+        // Repack bytes or words using a converting mov instruction
+        if (elemSize == GEN_BYTE_SCATTER_WORD)
+          sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
+        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+          sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+      }
     }
 
     void emitIndirectMove(Selection::Opaque &sel,
@@ -2469,8 +2509,7 @@ namespace gbe
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
           this->emitDWordGather(sel, insn, address, 0x2);
         else {
-          const GenRegister value = sel.selReg(insn.getValue(0));
-          this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+          this->emitByteGather(sel, insn, elemSize, address, 0x2);
         }
       }
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
@@ -2478,8 +2517,7 @@ namespace gbe
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
-        const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteGather(sel, insn, elemSize, address, value, space == MEM_LOCAL ? 0xfe : 0x01);
+        this->emitByteGather(sel, insn, elemSize, address, space == MEM_LOCAL ? 0xfe : 0x01);
       }
       return true;
     }
@@ -2535,22 +2573,43 @@ namespace gbe
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
                          GenRegister addr,
-                         GenRegister value,
                          uint32_t bti) const
     {
       using namespace ir;
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister dst = value;
-
-      GBE_ASSERT(insn.getValueNum() == 1);
-      if (elemSize == GEN_BYTE_SCATTER_WORD) {
-        value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UW));
-      } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
-        value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UB));
+      uint32_t valueNum = insn.getValueNum();
+
+      if(valueNum > 1) {
+        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
+        vector<GenRegister> value(valueNum);
+
+        if(elemSize == GEN_BYTE_SCATTER_WORD) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
+        } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
+        }
+
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+        }
+
+        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+      } else {
+        const GenRegister value = sel.selReg(insn.getValue(0));
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        if (elemSize == GEN_BYTE_SCATTER_WORD) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+        } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
+        }
+        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
       }
-      sel.BYTE_SCATTER(addr, value, elemSize, bti);
     }
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn) const
@@ -2566,8 +2625,7 @@ namespace gbe
         this->emitUntypedWrite(sel, insn, bti);
       else {
         const GenRegister address = sel.selReg(insn.getAddress());
-        const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteScatter(sel, insn, elemSize, address, value, bti);
+        this->emitByteScatter(sel, insn, elemSize, address, bti);
       }
       return true;
     }
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index e44b9d4..564dbc5 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -58,6 +58,8 @@ DECL_SELECTION_IR(WRITE64, Write64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
+DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
+DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 8794318..1158316 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -255,6 +255,28 @@ namespace gbe
       return r;
     }
 
+    // split a DWORD register into unpacked Byte or Short register
+    static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) {
+      GenRegister r = reg;
+      GBE_ASSERT(count == 4 || count == 2);
+      if(reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        r.hstride = count == 4 ? GEN_HORIZONTAL_STRIDE_4 : GEN_HORIZONTAL_STRIDE_2;
+      }
+      if(count == 4) {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UB : GEN_TYPE_B;
+        r.vstride = GEN_VERTICAL_STRIDE_32;
+      } else {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UW : GEN_TYPE_W;
+        r.vstride = GEN_VERTICAL_STRIDE_16;
+      }
+
+      r.subnr += sub_part*typeSize(r.type);
+      r.nr += r.subnr / 32;
+      r.subnr %= 32;
+
+      return r;
+    }
+
     INLINE bool isint64(void) const {
       if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
         return true;
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index f9f5e68..457b5b4 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -178,7 +178,8 @@ namespace ir {
     template <typename T> INLINE bool isMemberOf(void) const {
       return T::isClassOf(*this);
     }
-    static const uint32_t MAX_SRC_NUM = 16;
+    /*! max_src for store instruction (vec16 + addr) */
+    static const uint32_t MAX_SRC_NUM = 17;
     static const uint32_t MAX_DST_NUM = 16;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 4b692e2..dcc1497 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -578,7 +578,10 @@ namespace gbe
     void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
     void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
     template <bool isLoad, typename T> void visitLoadOrStore(T &I);
-
+    // batch vec4/8/16 load/store
+    INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                  Value *llvmValue, const ir::Register ptr,
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
   };
 
@@ -2774,6 +2777,61 @@ namespace gbe
   }
   void GenWriter::regAllocateStoreInst(StoreInst &I) {}
 
+  void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                                      Value *llvmValues, const ir::Register ptr,
+                                      const ir::AddressSpace addrSpace,
+                                      Type * elemType, bool isLoad) {
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+    const uint32_t perMsgNum = elemNum / msgNum;
+
+    for (uint32_t msg = 0; msg < msgNum; ++msg) {
+      // Build the tuple data in the vector
+      vector<ir::Register> tupleData; // put registers here
+      for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+        ir::Register reg;
+        if(regTranslator.isUndefConst(llvmValues, elemID)) {
+          Value *v = Constant::getNullValue(elemType);
+          reg = this->getRegister(v);
+        } else
+          reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+
+        tupleData.push_back(reg);
+      }
+      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
+
+      // We may need to update to offset the pointer
+      ir::Register addr;
+      if (msg == 0)
+        addr = ptr;
+      else {
+        const ir::Register offset = ctx.reg(pointerFamily);
+        ir::ImmediateIndex immIndex;
+        ir::Type immType;
+        // each message can read/write 16 byte
+        const int32_t stride = 16;
+        if (pointerFamily == ir::FAMILY_DWORD) {
+          immIndex = ctx.newImmediate(int32_t(msg*stride));
+          immType = ir::TYPE_S32;
+        } else {
+          immIndex = ctx.newImmediate(int64_t(msg*stride));
+          immType = ir::TYPE_S64;
+        }
+
+        addr = ctx.reg(pointerFamily);
+        ctx.LOADI(immType, offset, immIndex);
+        ctx.ADD(immType, addr, ptr, offset);
+      }
+
+      // Emit the instruction
+      if (isLoad)
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true);
+      else
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true);
+    }
+  }
+
   extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
   INLINE void GenWriter::emitLoadOrStore(T &I)
@@ -2811,12 +2869,14 @@ namespace gbe
       // count here.
       if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
           elemNum = 3;
+
       // The code is going to be fairly different from types to types (based on
       // size of each vector element)
       const ir::Type type = getType(ctx, elemType);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      const ir::RegisterFamily dataFamily = getFamily(type);
 
-      if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
@@ -2842,51 +2902,11 @@ namespace gbe
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          // We simply use several uint4 loads
-          const uint32_t msgNum = elemNum / 4;
-          for (uint32_t msg = 0; msg < msgNum; ++msg) {
-            // Build the tuple data in the vector
-            vector<ir::Register> tupleData; // put registers here
-            for (uint32_t elemID = 0; elemID < 4; ++elemID) {
-              ir::Register reg;
-              if(regTranslator.isUndefConst(llvmValues, elemID)) {
-                Value *v = Constant::getNullValue(elemType);
-                reg = this->getRegister(v);
-              } else
-                reg = this->getRegister(llvmValues, 4*msg+elemID);
-
-              tupleData.push_back(reg);
-            }
-            const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], 4);
-
-            // We may need to update to offset the pointer
-            ir::Register addr;
-            if (msg == 0)
-              addr = ptr;
-            else {
-              const ir::Register offset = ctx.reg(pointerFamily);
-              ir::ImmediateIndex immIndex;
-              ir::Type immType;
-              if (pointerFamily == ir::FAMILY_DWORD) {
-                immIndex = ctx.newImmediate(int32_t(msg*sizeof(uint32_t[4])));
-                immType = ir::TYPE_S32;
-              } else {
-                immIndex = ctx.newImmediate(int64_t(msg*sizeof(uint64_t[4])));
-                immType = ir::TYPE_S64;
-              }
-
-              addr = ctx.reg(pointerFamily);
-              ctx.LOADI(immType, offset, immIndex);
-              ctx.ADD(immType, addr, ptr, offset);
-            }
-
-            // Emit the instruction
-            if (isLoad)
-              ctx.LOAD(type, tuple, addr, addrSpace, 4, true);
-            else
-              ctx.STORE(type, tuple, addr, addrSpace, 4, true);
-          }
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
         }
+      }
+      else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
-- 
1.7.9.5



More information about the Beignet mailing list