[Beignet] [PATCH 1/2] GBE: relax the batch byte/short load vector size restriction.

Zhigang Gong zhigang.gong at intel.com
Wed Aug 27 19:46:03 PDT 2014


The previous restriction was that the vector size must be a multiple
of a DWORD. This restriction prevented the vload2/3 of char or the
vload3 of ushort from being optimized. This patch relaxes the restriction
on the vload path.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_context.cpp              |  6 ++--
 backend/src/backend/gen_insn_selection.cpp       | 39 +++++++++++-------------
 backend/src/llvm/llvm_gen_backend.cpp            |  3 +-
 backend/src/llvm/llvm_loadstore_optimization.cpp |  3 +-
 4 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index ba4a8f8..883fa39 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1693,7 +1693,7 @@ namespace gbe
   void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     for(uint32_t i = 0; i < insn.dstNum; i++) {
-      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.extra.elem, i));
     }
   }
 
@@ -1702,12 +1702,12 @@ namespace gbe
     p->push();
     if(simdWidth == 8) {
       for(uint32_t i = 0; i < insn.srcNum; i++)
-        p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+        p->MOV(GenRegister::splitReg(dst, insn.extra.elem, i), ra->genReg(insn.src(i)));
     } else {
       // when destination expands two registers, the source must span two registers.
       p->curr.execWidth = 8;
       for(uint32_t i = 0; i < insn.srcNum; i++) {
-        GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+        GenRegister dsti = GenRegister::splitReg(dst, insn.extra.elem, i);
         GenRegister src = ra->genReg(insn.src(i));
 
         p->curr.quarterControl = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 8478616..1258e54 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -575,10 +575,10 @@ namespace gbe
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
-    /*! Unpack the uint to char4 */
-    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
-    /*! pack the char4 to uint */
-    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
+    /*! Unpack the uint to charN */
+    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum);
+    /*! pack the charN to uint */
+    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -1255,16 +1255,18 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
     insn->src(0) = src;
+    insn->extra.elem = 4 / elemSize;
     for(uint32_t i = 0; i < elemNum; i++)
       insn->dst(i) = dst[i];
   }
-  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
     for(uint32_t i = 0; i < elemNum; i++)
       insn->src(i) = src[i];
+    insn->extra.elem = 4 / elemSize;
     insn->dst(0) = dst;
   }
 
@@ -2862,9 +2864,7 @@ namespace gbe
       for(uint32_t i = 0; i < valueNum; i++)
         dst[i] = sel.selReg(insn.getValue(i), getType(family));
 
-      uint32_t tmpRegNum = typeSize*valueNum / 4;
-      if (tmpRegNum == 0)
-        tmpRegNum = 1;
+      uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
       vector<GenRegister> tmp(tmpRegNum);
       vector<GenRegister> tmp2(tmpRegNum);
       vector<Register> tmpReg(tmpRegNum);
@@ -2875,15 +2875,10 @@ namespace gbe
 
       readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
 
-      if (valueNum > 1) {
-        for(uint32_t i = 0; i < tmpRegNum; i++)
-          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
-      }
-      else {
-        if (elemSize == GEN_BYTE_SCATTER_WORD)
-          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
-        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
+      for(uint32_t i = 0; i < tmpRegNum; i++) {
+        unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+                               4/typeSize : (valueNum - i * (4 / typeSize));
+        sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], typeSize, elemNum);
       }
     }
 
@@ -2948,7 +2943,7 @@ namespace gbe
         for(uint32_t i = 0; i < valueNum; i++)
           dst[i] = sel.selReg(insn.getValue(i), getType(family));
 
-        uint32_t effectDataNum = typeSize*valueNum / 4;
+        uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
         vector<GenRegister> tmp(effectDataNum + 1);
         vector<GenRegister> tmp2(effectDataNum + 1);
         vector<GenRegister> effectData(effectDataNum);
@@ -2986,7 +2981,9 @@ namespace gbe
         getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
 
         for(uint32_t i = 0; i < effectDataNum; i++) {
-          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
+          unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+                                 4/typeSize : (valueNum - i * (4 / typeSize));
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum);
         }
       } else {
         GBE_ASSERT(insn.getValueNum() == 1);
@@ -3148,7 +3145,7 @@ namespace gbe
         vector<GenRegister> tmp(tmpRegNum);
         for(uint32_t i = 0; i < tmpRegNum; i++) {
           tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
         }
 
         sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index b956bc6..8f0d5c2 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3515,7 +3515,8 @@ handle_write_image:
           emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
         }
       }
-      else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+      else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
+              (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
           emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 19726b0..ae91af7 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -259,8 +259,7 @@ namespace gbe {
         while(size > 1) {
           unsigned vecSize = (size >= 16) ? 16 :
                              (size >= 8 ? 8 :
-                             (size >= 4 ? 4 :
-                             (size >= 2 ? 2 : size)));
+                             (size >= 4 ? 4 : size));
           SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
           if(isLoad)
             mergeLoad(BB, mergedVec);
-- 
1.8.3.2



More information about the Beignet mailing list