[Beignet] [PATCH] Backend: Refine block read/write instruction selection

Xiuli Pan xiuli.pan at intel.com
Fri Dec 9 07:01:23 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Move the block read/write pack/unpack code out of gen_context and into
instruction selection, so that the generated instructions can be optimized.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen_context.cpp        | 459 ++---------------------------
 backend/src/backend/gen_insn_selection.cpp | 439 ++++++++++++++++++++-------
 2 files changed, 346 insertions(+), 552 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 798fac8..4e971a2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst= ra->genReg(insn.dst(1));
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
-    const uint32_t simdWidth = p->curr.execWidth;
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBREADA64(tmp, header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-
-    // Make header
-    p->push();
-    {
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0, 0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      p->MOV(headeraddr, addr);
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
-    }
-    p->pop();
-    // Now read the data, oword block read can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64) {
-          //p->curr.execWidth = 8;
-          p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
-        }
-        else
-          p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-      p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-        for (uint32_t j = 0; j < 4; j++)
-          p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize ; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBREADA64(tmp, header, insn.getbti(), 8);
-            else
-              p->OBREAD(tmp, header, insn.getbti(), 8);
-          }
-          p->pop();
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize ; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else NOT_SUPPORTED;
+       p->OBREAD(tmp, header, bti, ow_size);
   }
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    uint32_t type = ra->genReg(insn.src(1)).type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBWRITEA64(header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-    const uint32_t simdWidth = p->curr.execWidth;
-    uint32_t tmp_size = simdWidth * vec_size / 8;
-    tmp_size = tmp_size > 4 ? 4 : tmp_size;
-    uint32_t offset_size = isA64 ? 128 : 8;
-
-    p->push();
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0,0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      if (isA64)
-        p->MOV(headeraddr, addr);
-      else
-        p->SHR(headeraddr, addr, GenRegister::immud(4));
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
-    p->pop();
-    // Now write the data, oword block write can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < 4; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
-          else
-            p->OBWRITE(header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBWRITEA64(header, insn.getbti(), 8);
-            else
-              p->OBWRITE(header, insn.getbti(), 8);
-          }
-          p->pop();
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else NOT_SUPPORTED;
-
+       p->OBWRITE(header, bti, ow_size);
   }
 
   void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(1));
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
-    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
-    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // ushort in simd8 will have half reg, but response lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBREAD(dst, header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else if (simdWidth == 16)
-    {
-      const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          // Now read the data
-          p->curr.execWidth = 8;
-          p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
-          // Move the reg to fit vector rule.
-          for (uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
-                   GenRegister::offset(tmp, i));
-        }
-      p->pop();
-    } else NOT_IMPLEMENTED;
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister header = ra->genReg(insn.src(0));
+    const size_t rsize = insn.extra.elem;
+    p->MBREAD(dst, header, insn.getbti(), rsize);
   }
 
   void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    GenRegister offsetx, offsety, blocksizereg;
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = ra->genReg(insn.src(2)).type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
-
-    offsetx = GenRegister::offset(header, 0, 0*4);
-    offsety = GenRegister::offset(header, 0, 1*4);
-    blocksizereg = GenRegister::offset(header, 0, 2*4);
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
-                 ra->genReg(insn.src(2 + i)));
-        // ushort in simd8 will have half reg, but reponse lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBWRITE(header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
-        p->MBWRITE(header, insn.getbti(), vec_size);
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          p->curr.execWidth = 8;
-          // Mov what we need into msgs
-          for(uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
-          // Now write the data
-          p->MBWRITE(header, insn.getbti(), vec_size);
-        }
-
-      p->pop();
-    }
+    const GenRegister header = ra->genReg(insn.src(0)); // MBWRITE has no dst; the header is src(0)
+    const size_t msgsize = insn.extra.elem;
+    p->MBWRITE(header, insn.getbti(), msgsize);
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7fc22fa..a869986 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -728,13 +728,13 @@ namespace gbe
     void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
                       GenRegister tmpData1, GenRegister tmpData2);
     /*! Oblock read */
-    void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! Oblock write */
-    void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
     /*! Media block read */
-    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t block_size);
     /*! Media block write */
-    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t block_size);
 
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2236,118 +2236,84 @@ namespace gbe
   }
   void Selection::Opaque::OBREAD(GenRegister* dsts,
                                  uint32_t vec_size,
-                                 GenRegister addr,
                                  GenRegister header,
                                  uint32_t bti,
-                                 GenRegister* tmp,
-                                 uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+                                 uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
     SelectionVector *vector = this->appendVector();
-    insn->dst(0) = header;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = dsts[i];
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(1 + i + vec_size) = tmp[i];
-    insn->src(0) = addr;
+      insn->dst(i) = dsts[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector size
+    insn->extra.elem = ow_size; // number of OWord size
 
     // tmp regs for OWORD read dst
-    vector->regNum = tmp_size;
-    vector->reg = &insn->dst(1 + vec_size);
-    vector->offsetID = 1 + vec_size;
+    vector->regNum = vec_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
     vector->isSrc = 0;
   }
 
-  void Selection::Opaque::OBWRITE(GenRegister addr,
+  void Selection::Opaque::OBWRITE(GenRegister header,
                                   GenRegister* values,
                                   uint32_t vec_size,
-                                  GenRegister header,
                                   uint32_t bti,
-                                  GenRegister* tmp,
-                                  uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+                                  uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = addr;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
       insn->src(i + 1) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(i + 1) = tmp[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector_size
+    insn->extra.elem = ow_size; // number of OWord_size
 
-    // tmp regs for OWORD read dst
-    vector->regNum = tmp_size + 1;
-    vector->reg = &insn->dst(0);
+    // tmp regs for OWORD write header and values
+    vector->regNum = vec_size + 1;
+    vector->reg = &insn->src(0);
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
+
   }
 
   void Selection::Opaque::MBREAD(GenRegister* dsts,
-                                 GenRegister coordx,
-                                 GenRegister coordy,
+                                 uint32_t tmp_size,
                                  GenRegister header,
-                                 GenRegister* tmp,
                                  uint32_t bti,
-                                 uint32_t vec_size) {
-
-    uint32_t simdWidth = curr.execWidth;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i) {
-      insn->dst(i + 1) = dsts[i];
-      if(simdWidth == 16)
-        insn->dst(i + vec_size + 1) = tmp[i];
-    }
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
+                                 uint32_t block_size) {
+
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+    insn->src(0) = header;
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = block_size; // vector size
 
-    // Only in simd 8 the data is in vector form
-    if(simdWidth == 8) {
-      SelectionVector *vector = this->appendVector();
-      vector->regNum = vec_size;
-      vector->reg = &insn->dst(1);
-      vector->offsetID = 1;
-      vector->isSrc = 0;
-    }
-    if(simdWidth == 16)
-    {
-      SelectionVector *vectortmp = this->appendVector();
-      vectortmp->regNum = vec_size;
-      vectortmp->reg = &insn->dst(vec_size + 1);
-      vectortmp->offsetID = vec_size + 1;
-      vectortmp->isSrc = 0;
+    for (uint32_t i = 0; i < tmp_size; ++i) {
+      insn->dst(i) = dsts[i];
     }
+    SelectionVector *vector = this->appendVector();
+    vector->regNum = tmp_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
   }
 
-  void Selection::Opaque::MBWRITE(GenRegister coordx,
-                                  GenRegister coordy,
+  void Selection::Opaque::MBWRITE(GenRegister header,
                                   GenRegister* values,
-                                  GenRegister header,
-                                  GenRegister* tmp,
+                                  uint32_t tmp_size,
                                   uint32_t bti,
-                                  uint32_t vec_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+                                  uint32_t block_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->src(2 + i) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = tmp[i];
-    insn->state = this->curr;
+    insn->src(0) = header;
+    for (uint32_t i = 0; i < tmp_size; ++i)
+      insn->src(1 + i) = values[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = block_size; // msg size
 
     // We need to put the header and the data together
-    vector->regNum = 1 + vec_size;
+    vector->regNum = 1 + tmp_size;
-    vector->reg = &insn->dst(0);
+    vector->reg = &insn->src(0); // header + payload are sources; this insn has no dst
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
   }
 
   // Boiler plate to initialize the selection library at c++ pre-main
@@ -4683,18 +4649,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Non-A64 headers additionally need zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block read we need to unpack the block data into values, and for different
+       * simd widths and vector sizes with different type sizes, we may need to split the
+       * block read send message.
+       * A send message can only be 5 registers long,
+       * so each combination has its own message length and tmp vector size
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+            }
+            sel.pop();
+          }
+          sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+      }
+
     }
 
     // check whether all binded table index point to constant memory
@@ -5129,18 +5156,80 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block write we need to pack the block data into the tmp, and for different
+       * simd widths and vector sizes with different type sizes, we may need to split the
+       * block write send message.
+       * A single send message can carry at most 5 registers (1 header + 4 data),
+       * so different combinations need different message lengths and tmp vector sizes.
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
+            }
+            sel.pop();
+          }
+          sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+      }
+
+
     }
 
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
@@ -7616,20 +7705,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
       const Type type = insn.getType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 will have half reg, but response length is still 1
+      uint32_t rsize = simdWidth * vec_size * typeSize / 32;
+      rsize = rsize ? rsize : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
         valuesVec.push_back(sel.selReg(insn.getDst(i), type));
-        if(simdWidth == 16)
-          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD));
       }
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      GenRegister *tmp = NULL;
-      if(simdWidth == 16)
-        tmp = &tmpVec[0];
-      sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now read the data
+          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), rsize);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(valuesVec[i], tmpVec[i]);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now read the data
+          sel.curr.execWidth = 8;
+          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
+
       return true;
     }
     DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -7643,17 +7789,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
       const Type type = insn.getType();
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      uint32_t simdWidth = sel.curr.execWidth;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 will have half reg, but message length is still 1
+      uint32_t msgsize = simdWidth * vec_size * typeSize / 32;
+      msgsize = msgsize ? msgsize : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-      {
-        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      }
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      for (uint32_t i = 0; i < vec_size; ++i) {
+         valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD));
+        else
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+       }
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        for (uint32_t i = 0; i < vec_size; ++i) {
+            sel.MOV(tmpVec[i], valuesVec[i]);
+        }
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now write the data
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), msgsize);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], valuesVec[i]);
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now write the data
+          sel.curr.execWidth = 8;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
       return true;
     }
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
-- 
2.7.4



More information about the Beignet mailing list